
[MachineScheduler] Fix the TopDepth/BotHeightReduce latency heuristics

tryLatency compares two sched candidates. For the top zone it prefers
the one with the lesser depth, but only if that depth is greater than the
total latency of the instructions we've already scheduled -- otherwise
its latency would be hidden and there would be no stall.

Unfortunately it only tests the depth of one of the candidates. This can
lead to situations where the TopDepthReduce heuristic does not kick in,
but a lower-priority heuristic chooses the other candidate, whose depth
*is* greater than the already scheduled latency, which causes a stall.

The fix is to apply the heuristic if the depth of *either* candidate is
greater than the already scheduled latency.

All this also applies to the BotHeightReduce heuristic in the bottom
zone.
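
To make the change concrete, here is a small standalone sketch of the new
top-zone test (hypothetical helper name, with plain integers standing in for
the SUnit depth and SchedBoundary latency queries; a minimal illustration,
not the upstream code):

  #include <algorithm>
  #include <cstdio>

  // Hypothetical helper mirroring the new top-zone test: consult the
  // TopDepthReduce heuristic if *either* candidate's depth exceeds the
  // latency already scheduled (the old code only looked at Cand's depth).
  static bool shouldApplyTopDepthReduce(unsigned TryCandDepth,
                                        unsigned CandDepth,
                                        unsigned ScheduledLatency) {
    return std::max(TryCandDepth, CandDepth) > ScheduledLatency;
  }

  int main() {
    // Previously skipped: Cand is shallow but TryCand is deep enough to stall.
    printf("%d\n", shouldApplyTopDepthReduce(/*TryCandDepth=*/8,
                                             /*CandDepth=*/2,
                                             /*ScheduledLatency=*/4)); // prints 1
  }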

Differential Revision: https://reviews.llvm.org/D72392
Author: Jay Foad 2020-01-07 15:43:46 +00:00
parent 42cb8128e2
commit 3f23d4b8c3
114 changed files with 3531 additions and 3428 deletions

@ -2724,7 +2724,11 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
GenericSchedulerBase::SchedCandidate &Cand,
SchedBoundary &Zone) {
if (Zone.isTop()) {
if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
// Prefer the candidate with the lesser depth, but only if one of them has
// depth greater than the total latency scheduled so far, otherwise either
// of them could be scheduled now with no stall.
if (std::max(TryCand.SU->getDepth(), Cand.SU->getDepth()) >
Zone.getScheduledLatency()) {
if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
return true;
@ -2733,7 +2737,11 @@ bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
TryCand, Cand, GenericSchedulerBase::TopPathReduce))
return true;
} else {
if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
// Prefer the candidate with the lesser height, but only if one of them has
// height greater than the total latency scheduled so far, otherwise either
// of them could be scheduled now with no stall.
if (std::max(TryCand.SU->getHeight(), Cand.SU->getHeight()) >
Zone.getScheduledLatency()) {
if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
return true;

@ -27,10 +27,10 @@ entry:
; NONE16: fmov s1, wzr
; NONE16: fmov d2, xzr
; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0
; ZEROFP: ldr h0,{{.*}}
; ZEROFP: movi v{{[0-3]+}}.2d, #0
; ZEROFP: movi v{{[0-3]+}}.2d, #0
; ZEROFP: movi v{{[0-3]+}}.2d, #0
; ZEROFP-DAG: ldr h0,{{.*}}
; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0
; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0
; ZEROFP-DAG: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0
; ZERO16: movi v{{[0-3]+}}.2d, #0

@ -590,21 +590,21 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_movk_i32 s6, 0xff
; SI-NEXT: s_movk_i32 s0, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v1, s6, v2
; SI-NEXT: v_and_b32_e32 v1, s0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v2, s6, v3
; SI-NEXT: v_and_b32_e32 v2, s0, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, s6, v4
; SI-NEXT: v_and_b32_e32 v3, s0, v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, s6, v0
; SI-NEXT: v_and_b32_e32 v4, s0, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@ -839,21 +839,21 @@ define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* no
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: s_movk_i32 s6, 0xff
; SI-NEXT: s_movk_i32 s0, 0xff
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_and_b32_e32 v1, s6, v2
; SI-NEXT: v_and_b32_e32 v1, s0, v2
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_and_b32_e32 v2, s6, v3
; SI-NEXT: v_and_b32_e32 v2, s0, v3
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_and_b32_e32 v3, s6, v4
; SI-NEXT: v_and_b32_e32 v3, s0, v4
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v4, s6, v0
; SI-NEXT: v_and_b32_e32 v4, s0, v0
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v2
; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v3
; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v4
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;

@ -141,18 +141,18 @@ define void @constrained_if_register_class() {
; CHECK-NEXT: s_cmp_lg_u32 s4, 0
; CHECK-NEXT: s_cbranch_scc0 BB4_6
; CHECK-NEXT: ; %bb.1: ; %bb2
; CHECK-NEXT: s_getpc_b64 s[6:7]
; CHECK-NEXT: s_add_u32 s6, s6, const.ptr@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s7, s7, const.ptr@gotpcrel32@hi+4
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: s_mov_b32 s4, -1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s6
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: flat_load_dword v0, v[0:1]
; CHECK-NEXT: s_getpc_b64 s[4:5]
; CHECK-NEXT: s_add_u32 s4, s4, const.ptr@gotpcrel32@lo+4
; CHECK-NEXT: s_addc_u32 s5, s5, const.ptr@gotpcrel32@hi+4
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, 1
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s4
; CHECK-NEXT: v_mov_b32_e32 v1, s5
; CHECK-NEXT: flat_load_dword v0, v[0:1]
; CHECK-NEXT: s_mov_b32 s4, -1
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_cmp_gt_f32_e32 vcc, 1.0, v0
; CHECK-NEXT: s_xor_b64 s[8:9], vcc, s[6:7]

@ -1555,40 +1555,40 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(i32 addrspace(1)* %out0,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v3, s4
; CI-NEXT: ds_inc_rtn_u32 v4, v3, v2
; CI-NEXT: ds_inc_rtn_u32 v5, v3, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: ds_inc_rtn_u32 v4, v1, v0
; CI-NEXT: ds_inc_rtn_u32 v5, v1, v0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: flat_store_dword v[2:3], v4
; CI-NEXT: flat_store_dword v[0:1], v4
; CI-NEXT: s_waitcnt lgkmcnt(1)
; CI-NEXT: flat_store_dword v[0:1], v5
; CI-NEXT: flat_store_dword v[2:3], v5
; CI-NEXT: s_endpgm
;
; VI-LABEL: nocse_lds_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v3, s4
; VI-NEXT: ds_inc_rtn_u32 v4, v3, v2
; VI-NEXT: ds_inc_rtn_u32 v5, v3, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: ds_inc_rtn_u32 v4, v1, v0
; VI-NEXT: ds_inc_rtn_u32 v5, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: flat_store_dword v[2:3], v4
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt lgkmcnt(1)
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: flat_store_dword v[2:3], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: nocse_lds_atomic_inc_ret_i32:

@ -848,17 +848,17 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace
; GFX7-NEXT: s_mov_b64 s[0:1], s[6:7]
; GFX7-NEXT: buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
; GFX7-NEXT: buffer_load_dword v4, v[1:2], s[0:3], 0 addr64 offset:4
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64 offset:8
; GFX7-NEXT: s_cmp_lg_u32 s8, 0
; GFX7-NEXT: s_cselect_b32 s6, 1, 0
; GFX7-NEXT: s_and_b32 s0, 1, s6
; GFX7-NEXT: s_cselect_b32 s0, 1, 0
; GFX7-NEXT: s_and_b32 s0, 1, s0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_and_b64 vcc, vcc, s[0:1]
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v0
; GFX7-NEXT: v_div_fmas_f32 v0, v3, v4, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
; GFX7-NEXT: s_endpgm
;

@ -950,21 +950,21 @@ define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out)
define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
; GFX6-LABEL: simplify_bfe_u32_multi_use_arg:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
; GFX6-NEXT: s_mov_b32 s6, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b64 s[10:11], s[6:7]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
; GFX6-NEXT: s_load_dword s8, s[2:3], 0x0
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_and_b32 s0, s0, 63
; GFX6-NEXT: s_bfe_u32 s1, s0, 0x20002
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: s_and_b32 s8, s8, 63
; GFX6-NEXT: s_bfe_u32 s9, s8, 0x20002
; GFX6-NEXT: v_mov_b32_e32 v1, s9
; GFX6-NEXT: v_mov_b32_e32 v0, s8
; GFX6-NEXT: buffer_store_dword v1, off, s[4:7], 0
; GFX6-NEXT: buffer_store_dword v0, off, s[8:11], 0
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
i32 addrspace(1)* %out1,
i32 addrspace(1)* %in) #0 {

@ -20,53 +20,53 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 11, v0
; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-6
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-5
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-4
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-3
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off offset:-2
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-1
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[2:3], off offset:-6
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[2:3], off offset:-5
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[2:3], off offset:-4
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[2:3], off offset:-3
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[2:3], off offset:-2
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[2:3], off offset:-1
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off offset:-10
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[2:3], off offset:-9
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[2:3], off offset:-8
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[2:3], off offset:-9
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[2:3], off offset:-8
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[2:3], off offset:-7
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, 0xff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xff
; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v4, v13, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v4
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v4
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v5, v5, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v8, v13, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v11, v4
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v12, v4
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v3
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v13
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v13, s4, v14
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v4, v6
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v13
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v7
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v8
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v9, v4, v5
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v10
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v11
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v3
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v6, v7
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v4, v5, v8
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v11
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v6
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v7, v3, v8
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v9
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v10
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v11
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v2, v4, v5
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v6, v7
; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
@ -85,61 +85,62 @@ define <3 x i32> @v_load_constant_v3i32_align1(<3 x i32> addrspace(4)* %ptr) {
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s6, 0
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NOUNALIGNED-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:5
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:7
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:9
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:10
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:11
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v13, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, 0xff
; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s8, 0xff
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, v[0:1], s[4:7], 0 addr64
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:1
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:3
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, v[0:1], s[4:7], 0 addr64 offset:5
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, v[0:1], s[4:7], 0 addr64 offset:7
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, v[0:1], s[4:7], 0 addr64 offset:9
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, v[0:1], s[4:7], 0 addr64 offset:10
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 offset:11
; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, 0xff
; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v6
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v9, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s8, v10
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s8, v11
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s8, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v10
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s8, v0
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v0, v1
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v3
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v7
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v8
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v9
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v11
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v12, 24, v0
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v2, v1
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v6, v5
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v10, v9
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s8, v13
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v11
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v9
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v6, v7
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v3, v3, v8
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v10
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v0, v4
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v1, v11
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v3, v2
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v4
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v8
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v12
; GFX7-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
%load = load <3 x i32>, <3 x i32> addrspace(4)* %ptr, align 1
ret <3 x i32> %load
@ -158,27 +159,27 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_add_co_u32_e32 v2, vcc, 10, v0
; GFX9-NOUNALIGNED-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v[2:3], off offset:-8
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-6
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[2:3], off offset:-4
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[2:3], off offset:-6
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[2:3], off offset:-4
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[2:3], off offset:-2
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v4
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v3
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v7, v4
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v3
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v6, v4, v3
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v5, v3, v6
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v4, v5
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, v3, v4
; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
@ -203,18 +204,18 @@ define <3 x i32> @v_load_constant_v3i32_align2(<3 x i32> addrspace(4)* %ptr) {
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, v[0:1], s[4:7], 0 addr64 offset:4
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v6, v[0:1], s[4:7], 0 addr64 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:8
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s8, v3
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s8, v4
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s8, v5
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s8, v6
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v6
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s8, v0
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s8, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, s4, v0
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v0
@ -432,58 +433,58 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v16, s2
; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 9
; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[12:13], off
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, s3
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, s2
; GFX9-NOUNALIGNED-NEXT: s_add_u32 s2, s0, 10
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s3, s1, 0
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NOUNALIGNED-NEXT: s_add_u32 s0, s0, 11
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[14:15], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[16:17], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[10:11], off
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v20, v[10:11], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v21, v[12:13], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v14, v[14:15], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v15, v[16:17], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[18:19], off
; GFX9-NOUNALIGNED-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v16, v[10:11], off
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s1
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s0
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v11, s3
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, s1
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v10, s2
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, s0
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[10:11], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[12:13], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v[2:3], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[4:5], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[6:7], off
; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[8:9], off
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8
; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v18, 0xff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v19, 8
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8
; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v6, 8
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v2
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v2
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v3
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v3
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v0, v1, v2
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v12, v18
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v13, v18
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v21, v5
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v14, v5
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v19, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v18, v0
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v4, v5, v0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v0, v1, v2
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v19, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v10, v18
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v0, v6, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v11, v5
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v16, v18
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v14, v18, v0
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v10, v5
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v15, v5, v0
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2
; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v0, v1, v2
@ -508,60 +509,59 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
; GFX7-NOUNALIGNED: ; %bb.0:
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s2, -1
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:5
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:7
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:8
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:9
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:10
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:11
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:1
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:2
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:3
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v12, off, s[0:3], 0 offset:4
; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff
; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:1
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:2
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 offset:3
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v4, off, s[0:3], 0 offset:4
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v5, off, s[0:3], 0 offset:5
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v6, off, s[0:3], 0 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v7, off, s[0:3], 0 offset:7
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v8, off, s[0:3], 0 offset:8
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v9, off, s[0:3], 0 offset:9
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v10, off, s[0:3], 0 offset:10
; GFX7-NOUNALIGNED-NEXT: buffer_load_ubyte v11, off, s[0:3], 0 offset:11
; GFX7-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xff
; GFX7-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(11)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v1, v0
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, v4, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, s4, v8
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, s4, v9
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v7, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s4, v10
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 8, v9
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v12, s4, v12
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 8, v1
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v10
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v11
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v0
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v8, v7
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(7)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v12
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v12
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v12
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v10, v10, v12
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v12
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v12
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v12, v1
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v4, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v4, v5
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v9
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v2
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v6
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v4, v8, v9
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v3
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v10
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v3
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v7
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v6
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 24, v11
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v4, v10
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v0, v0, v3
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v1, v1, v7
; GFX7-NOUNALIGNED-NEXT: v_or_b32_e32 v2, v2, v11
; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
@ -613,21 +613,21 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[4:5], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[6:7], off
; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[8:9], off
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v12, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s4, v1
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v0, s0, v1
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v12
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v3, v5
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v12, v0
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v12
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v2, v5, v0
; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, v10, v5
; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v12, v0
; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v4, v5, v0
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
; GFX9-NOUNALIGNED-NEXT: ; return to shader part epilog
@ -656,19 +656,19 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:4
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6
; GFX7-NOUNALIGNED-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff
; GFX7-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v0
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v0
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v1
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(3)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v2
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v3
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4
; GFX7-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0)
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5
; GFX7-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s0, v5
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v0
; GFX7-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4

@ -96,8 +96,8 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, gv3@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, gv3@gotpcrel32@hi+4
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_mov_b32 s0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@ -116,10 +116,10 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
; GFX9-NEXT: s_getpc_b64 s[0:1]
; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s1, s1, gv0@gotpcrel32@hi+4
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_getpc_b64 s[2:3]
; GFX9-NEXT: s_add_u32 s2, s2, gv1@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s3, s3, gv1@gotpcrel32@hi+4
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 1

@ -162,9 +162,9 @@ define i96 @zextload_global_i32_to_i96(i32 addrspace(1)* %ptr) {
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b32 s8, 0
; GFX6-NEXT: s_mov_b32 s4, 0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr
@ -180,8 +180,8 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
; GFX9-NEXT: s_mov_b32 s4, 0
; GFX9-NEXT: s_mov_b32 s5, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, s5
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
@ -192,8 +192,8 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
; GFX8-NEXT: s_mov_b32 s4, 0
; GFX8-NEXT: s_mov_b32 s5, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: v_mov_b32_e32 v2, s5
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@ -204,11 +204,11 @@ define i128 @zextload_global_i32_to_i128(i32 addrspace(1)* %ptr) {
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b32 s8, 0
; GFX6-NEXT: s_mov_b32 s4, 0
; GFX6-NEXT: s_mov_b32 s5, 0
; GFX6-NEXT: v_mov_b32_e32 v1, 0
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%load = load i32, i32 addrspace(1)* %ptr

@ -105,8 +105,8 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
; GFX9: v_pk_sub_u16 v{{[0-9]+}}, v{{[0-9]+}}, 1 op_sel_hi:[1,0]{{$}}
; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
; VI: flat_load_dword [[LOAD:v[0-9]+]]
; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
; VI-DAG: flat_load_dword [[LOAD:v[0-9]+]]
; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[LOAD]], v[[SCONST]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD]]
; VI: v_or_b32_e32

@ -5059,16 +5059,16 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
;
; GCN-LABEL: udiv_i64_pow2k_denom:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_lshr_b64 s[4:5], s[6:7], 12
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_lshr_b64 s[0:1], s[2:3], 12
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%r = udiv i64 %x, 4096
store i64 %r, i64 addrspace(1)* %out
@ -5703,20 +5703,20 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(i64 addrspace(1)* %out, i64 %x)
;
; GCN-LABEL: sdiv_i64_pow2k_denom:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_ashr_i32 s4, s7, 31
; GCN-NEXT: s_lshr_b32 s4, s4, 20
; GCN-NEXT: s_add_u32 s4, s6, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_addc_u32 s5, s7, 0
; GCN-NEXT: s_ashr_i64 s[4:5], s[4:5], 12
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_ashr_i32 s0, s3, 31
; GCN-NEXT: s_lshr_b32 s0, s0, 20
; GCN-NEXT: s_add_u32 s0, s2, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: s_ashr_i64 s[0:1], s[0:1], 12
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
%r = sdiv i64 %x, 4096
store i64 %r, i64 addrspace(1)* %out

@ -339,35 +339,34 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s6, 0xff00ff
; SI-NEXT: s_mov_b32 s8, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s10, 0x33333333
; SI-NEXT: s_mov_b32 s11, 0xcccccccc
; SI-NEXT: s_mov_b32 s0, 0x55555555
; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s0, 0xff00ff
; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s3, 0x33333333
; SI-NEXT: s_mov_b32 s6, 0xcccccccc
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v2, v0, v0, 8
; SI-NEXT: v_alignbit_b32 v0, v0, v0, 24
; SI-NEXT: v_alignbit_b32 v3, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_bfi_b32 v2, s6, v0, v2
; SI-NEXT: v_bfi_b32 v4, s6, v1, v3
; SI-NEXT: v_and_b32_e32 v1, s8, v2
; SI-NEXT: v_and_b32_e32 v0, s8, v4
; SI-NEXT: v_and_b32_e32 v3, s9, v2
; SI-NEXT: v_and_b32_e32 v2, s9, v4
; SI-NEXT: v_bfi_b32 v2, s0, v0, v2
; SI-NEXT: v_bfi_b32 v4, s0, v1, v3
; SI-NEXT: v_and_b32_e32 v1, s1, v2
; SI-NEXT: v_and_b32_e32 v0, s1, v4
; SI-NEXT: v_and_b32_e32 v3, s2, v2
; SI-NEXT: v_and_b32_e32 v2, s2, v4
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s0, 0x55555555
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_and_b32_e32 v1, s10, v3
; SI-NEXT: v_and_b32_e32 v0, s10, v2
; SI-NEXT: v_and_b32_e32 v3, s11, v3
; SI-NEXT: v_and_b32_e32 v2, s11, v2
; SI-NEXT: v_and_b32_e32 v1, s3, v3
; SI-NEXT: v_and_b32_e32 v0, s3, v2
; SI-NEXT: v_and_b32_e32 v3, s6, v3
; SI-NEXT: v_and_b32_e32 v2, s6, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT: s_mov_b32 s1, 0xaaaaaaaa
; SI-NEXT: v_or_b32_e32 v3, v3, v1
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_and_b32_e32 v1, s0, v3
@ -376,6 +375,7 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; SI-NEXT: v_and_b32_e32 v2, s1, v2
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: v_or_b32_e32 v1, v3, v1
; SI-NEXT: v_or_b32_e32 v0, v2, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@ -386,33 +386,33 @@ define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; FLAT-NEXT: s_mov_b32 s6, 0x10203
; FLAT-NEXT: s_mov_b32 s2, 0x33333333
; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s3, 0x33333333
; FLAT-NEXT: s_mov_b32 s6, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s0, 0x10203
; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FLAT-NEXT: v_perm_b32 v2, 0, v0, s6
; FLAT-NEXT: v_perm_b32 v4, 0, v1, s6
; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
; FLAT-NEXT: v_perm_b32 v2, 0, v0, s0
; FLAT-NEXT: v_perm_b32 v4, 0, v1, s0
; FLAT-NEXT: v_and_b32_e32 v1, s1, v2
; FLAT-NEXT: v_and_b32_e32 v0, s1, v4
; FLAT-NEXT: v_and_b32_e32 v3, s2, v2
; FLAT-NEXT: v_and_b32_e32 v2, s2, v4
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT: s_mov_b32 s0, 0x55555555
; FLAT-NEXT: v_or_b32_e32 v3, v3, v1
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
; FLAT-NEXT: v_and_b32_e32 v1, s3, v3
; FLAT-NEXT: v_and_b32_e32 v0, s3, v2
; FLAT-NEXT: v_and_b32_e32 v3, s6, v3
; FLAT-NEXT: v_and_b32_e32 v2, s6, v2
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: s_mov_b32 s1, 0xaaaaaaaa
@ -600,13 +600,13 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s8, 0xff00ff
; SI-NEXT: s_mov_b32 s9, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s10, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s11, 0x33333333
; SI-NEXT: s_mov_b32 s12, 0xcccccccc
; SI-NEXT: s_mov_b32 s13, 0x55555555
; SI-NEXT: s_mov_b32 s14, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s0, 0xff00ff
; SI-NEXT: s_mov_b32 s1, 0xf0f0f0f
; SI-NEXT: s_mov_b32 s2, 0xf0f0f0f0
; SI-NEXT: s_mov_b32 s3, 0x33333333
; SI-NEXT: s_mov_b32 s8, 0xcccccccc
; SI-NEXT: s_mov_b32 s9, 0x55555555
; SI-NEXT: s_mov_b32 s10, 0xaaaaaaaa
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_alignbit_b32 v4, v2, v2, 8
@ -617,18 +617,18 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_alignbit_b32 v7, v1, v1, 8
; SI-NEXT: v_alignbit_b32 v1, v1, v1, 24
; SI-NEXT: v_alignbit_b32 v3, v3, v3, 24
; SI-NEXT: v_bfi_b32 v2, s8, v2, v4
; SI-NEXT: v_bfi_b32 v4, s8, v3, v5
; SI-NEXT: v_bfi_b32 v6, s8, v0, v6
; SI-NEXT: v_bfi_b32 v8, s8, v1, v7
; SI-NEXT: v_and_b32_e32 v1, s9, v2
; SI-NEXT: v_and_b32_e32 v0, s9, v4
; SI-NEXT: v_and_b32_e32 v3, s10, v2
; SI-NEXT: v_and_b32_e32 v2, s10, v4
; SI-NEXT: v_and_b32_e32 v5, s9, v6
; SI-NEXT: v_and_b32_e32 v4, s9, v8
; SI-NEXT: v_and_b32_e32 v7, s10, v6
; SI-NEXT: v_and_b32_e32 v6, s10, v8
; SI-NEXT: v_bfi_b32 v2, s0, v2, v4
; SI-NEXT: v_bfi_b32 v4, s0, v3, v5
; SI-NEXT: v_bfi_b32 v6, s0, v0, v6
; SI-NEXT: v_bfi_b32 v8, s0, v1, v7
; SI-NEXT: v_and_b32_e32 v1, s1, v2
; SI-NEXT: v_and_b32_e32 v0, s1, v4
; SI-NEXT: v_and_b32_e32 v3, s2, v2
; SI-NEXT: v_and_b32_e32 v2, s2, v4
; SI-NEXT: v_and_b32_e32 v5, s1, v6
; SI-NEXT: v_and_b32_e32 v4, s1, v8
; SI-NEXT: v_and_b32_e32 v7, s2, v6
; SI-NEXT: v_and_b32_e32 v6, s2, v8
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 4
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 4
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 4
@ -637,14 +637,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
; SI-NEXT: v_and_b32_e32 v1, s11, v3
; SI-NEXT: v_and_b32_e32 v0, s11, v2
; SI-NEXT: v_and_b32_e32 v5, s11, v7
; SI-NEXT: v_and_b32_e32 v4, s11, v6
; SI-NEXT: v_and_b32_e32 v3, s12, v3
; SI-NEXT: v_and_b32_e32 v2, s12, v2
; SI-NEXT: v_and_b32_e32 v7, s12, v7
; SI-NEXT: v_and_b32_e32 v6, s12, v6
; SI-NEXT: v_and_b32_e32 v1, s3, v3
; SI-NEXT: v_and_b32_e32 v0, s3, v2
; SI-NEXT: v_and_b32_e32 v5, s3, v7
; SI-NEXT: v_and_b32_e32 v4, s3, v6
; SI-NEXT: v_and_b32_e32 v3, s8, v3
; SI-NEXT: v_and_b32_e32 v2, s8, v2
; SI-NEXT: v_and_b32_e32 v7, s8, v7
; SI-NEXT: v_and_b32_e32 v6, s8, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 2
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 2
@ -653,14 +653,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; SI-NEXT: v_or_b32_e32 v2, v2, v0
; SI-NEXT: v_or_b32_e32 v7, v7, v5
; SI-NEXT: v_or_b32_e32 v6, v6, v4
; SI-NEXT: v_and_b32_e32 v1, s13, v3
; SI-NEXT: v_and_b32_e32 v0, s13, v2
; SI-NEXT: v_and_b32_e32 v5, s13, v7
; SI-NEXT: v_and_b32_e32 v4, s13, v6
; SI-NEXT: v_and_b32_e32 v3, s14, v3
; SI-NEXT: v_and_b32_e32 v2, s14, v2
; SI-NEXT: v_and_b32_e32 v7, s14, v7
; SI-NEXT: v_and_b32_e32 v6, s14, v6
; SI-NEXT: v_and_b32_e32 v1, s9, v3
; SI-NEXT: v_and_b32_e32 v0, s9, v2
; SI-NEXT: v_and_b32_e32 v5, s9, v7
; SI-NEXT: v_and_b32_e32 v4, s9, v6
; SI-NEXT: v_and_b32_e32 v3, s10, v3
; SI-NEXT: v_and_b32_e32 v2, s10, v2
; SI-NEXT: v_and_b32_e32 v7, s10, v7
; SI-NEXT: v_and_b32_e32 v6, s10, v6
; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
; SI-NEXT: v_lshr_b64 v[2:3], v[2:3], 1
; SI-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
@ -677,33 +677,33 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; FLAT-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; FLAT-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; FLAT-NEXT: s_mov_b32 s10, 0x10203
; FLAT-NEXT: s_mov_b32 s2, 0x33333333
; FLAT-NEXT: s_mov_b32 s3, 0xcccccccc
; FLAT-NEXT: s_mov_b32 s2, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s3, 0x33333333
; FLAT-NEXT: s_mov_b32 s8, 0xcccccccc
; FLAT-NEXT: s_waitcnt lgkmcnt(0)
; FLAT-NEXT: v_mov_b32_e32 v1, s1
; FLAT-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; FLAT-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; FLAT-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; FLAT-NEXT: s_mov_b32 s0, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f0
; FLAT-NEXT: s_mov_b32 s8, 0x55555555
; FLAT-NEXT: s_mov_b32 s9, 0xaaaaaaaa
; FLAT-NEXT: s_mov_b32 s0, 0x10203
; FLAT-NEXT: s_mov_b32 s1, 0xf0f0f0f
; FLAT-NEXT: s_mov_b32 s9, 0x55555555
; FLAT-NEXT: s_mov_b32 s10, 0xaaaaaaaa
; FLAT-NEXT: s_mov_b32 s7, 0xf000
; FLAT-NEXT: s_mov_b32 s6, -1
; FLAT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; FLAT-NEXT: v_perm_b32 v6, 0, v0, s10
; FLAT-NEXT: v_perm_b32 v4, 0, v3, s10
; FLAT-NEXT: v_perm_b32 v2, 0, v2, s10
; FLAT-NEXT: v_perm_b32 v8, 0, v1, s10
; FLAT-NEXT: v_and_b32_e32 v1, s0, v2
; FLAT-NEXT: v_and_b32_e32 v0, s0, v4
; FLAT-NEXT: v_and_b32_e32 v3, s1, v2
; FLAT-NEXT: v_and_b32_e32 v2, s1, v4
; FLAT-NEXT: v_and_b32_e32 v5, s0, v6
; FLAT-NEXT: v_and_b32_e32 v4, s0, v8
; FLAT-NEXT: v_and_b32_e32 v7, s1, v6
; FLAT-NEXT: v_and_b32_e32 v6, s1, v8
; FLAT-NEXT: v_perm_b32 v6, 0, v0, s0
; FLAT-NEXT: v_perm_b32 v4, 0, v3, s0
; FLAT-NEXT: v_perm_b32 v2, 0, v2, s0
; FLAT-NEXT: v_perm_b32 v8, 0, v1, s0
; FLAT-NEXT: v_and_b32_e32 v1, s1, v2
; FLAT-NEXT: v_and_b32_e32 v0, s1, v4
; FLAT-NEXT: v_and_b32_e32 v3, s2, v2
; FLAT-NEXT: v_and_b32_e32 v2, s2, v4
; FLAT-NEXT: v_and_b32_e32 v5, s1, v6
; FLAT-NEXT: v_and_b32_e32 v4, s1, v8
; FLAT-NEXT: v_and_b32_e32 v7, s2, v6
; FLAT-NEXT: v_and_b32_e32 v6, s2, v8
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 4, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 4, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 4, v[4:5]
@ -712,14 +712,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
; FLAT-NEXT: v_and_b32_e32 v1, s2, v3
; FLAT-NEXT: v_and_b32_e32 v0, s2, v2
; FLAT-NEXT: v_and_b32_e32 v5, s2, v7
; FLAT-NEXT: v_and_b32_e32 v4, s2, v6
; FLAT-NEXT: v_and_b32_e32 v3, s3, v3
; FLAT-NEXT: v_and_b32_e32 v2, s3, v2
; FLAT-NEXT: v_and_b32_e32 v7, s3, v7
; FLAT-NEXT: v_and_b32_e32 v6, s3, v6
; FLAT-NEXT: v_and_b32_e32 v1, s3, v3
; FLAT-NEXT: v_and_b32_e32 v0, s3, v2
; FLAT-NEXT: v_and_b32_e32 v5, s3, v7
; FLAT-NEXT: v_and_b32_e32 v4, s3, v6
; FLAT-NEXT: v_and_b32_e32 v3, s8, v3
; FLAT-NEXT: v_and_b32_e32 v2, s8, v2
; FLAT-NEXT: v_and_b32_e32 v7, s8, v7
; FLAT-NEXT: v_and_b32_e32 v6, s8, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 2, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 2, v[4:5]
@ -728,14 +728,14 @@ define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2
; FLAT-NEXT: v_or_b32_e32 v2, v2, v0
; FLAT-NEXT: v_or_b32_e32 v7, v7, v5
; FLAT-NEXT: v_or_b32_e32 v6, v6, v4
; FLAT-NEXT: v_and_b32_e32 v1, s8, v3
; FLAT-NEXT: v_and_b32_e32 v0, s8, v2
; FLAT-NEXT: v_and_b32_e32 v5, s8, v7
; FLAT-NEXT: v_and_b32_e32 v4, s8, v6
; FLAT-NEXT: v_and_b32_e32 v3, s9, v3
; FLAT-NEXT: v_and_b32_e32 v2, s9, v2
; FLAT-NEXT: v_and_b32_e32 v7, s9, v7
; FLAT-NEXT: v_and_b32_e32 v6, s9, v6
; FLAT-NEXT: v_and_b32_e32 v1, s9, v3
; FLAT-NEXT: v_and_b32_e32 v0, s9, v2
; FLAT-NEXT: v_and_b32_e32 v5, s9, v7
; FLAT-NEXT: v_and_b32_e32 v4, s9, v6
; FLAT-NEXT: v_and_b32_e32 v3, s10, v3
; FLAT-NEXT: v_and_b32_e32 v2, s10, v2
; FLAT-NEXT: v_and_b32_e32 v7, s10, v7
; FLAT-NEXT: v_and_b32_e32 v6, s10, v6
; FLAT-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
; FLAT-NEXT: v_lshrrev_b64 v[2:3], 1, v[2:3]
; FLAT-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]

@ -33,17 +33,17 @@ define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(
;
; VI-LABEL: test_bswap_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_load_dword s4, s[6:7], 0x0
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v0, 0, s4, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_perm_b32 v0, 0, s0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%val = load i32, i32 addrspace(1)* %in, align 4
%bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
@ -72,18 +72,18 @@ define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
;
; VI-LABEL: test_bswap_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v1, 0, s5, v0
; VI-NEXT: v_perm_b32 v0, 0, s4, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: v_perm_b32 v1, 0, s3, v0
; VI-NEXT: v_perm_b32 v0, 0, s2, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
@ -123,14 +123,14 @@ define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s7, v0
; VI-NEXT: v_perm_b32 v2, 0, s6, v0
; VI-NEXT: v_perm_b32 v1, 0, s5, v0
; VI-NEXT: v_perm_b32 v0, 0, s4, v0
; VI-NEXT: v_perm_b32 v3, 0, s11, v0
; VI-NEXT: v_perm_b32 v2, 0, s10, v0
; VI-NEXT: v_perm_b32 v1, 0, s9, v0
; VI-NEXT: v_perm_b32 v0, 0, s8, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
@ -226,18 +226,18 @@ define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(
;
; VI-LABEL: test_bswap_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_mov_b32_e32 v0, 0x10203
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v1, 0, s4, v0
; VI-NEXT: v_perm_b32 v0, 0, s5, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: v_perm_b32 v1, 0, s2, v0
; VI-NEXT: v_perm_b32 v0, 0, s3, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%val = load i64, i64 addrspace(1)* %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
@ -277,14 +277,14 @@ define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx4 s[8:11], s[6:7], 0x0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_perm_b32 v3, 0, s6, v0
; VI-NEXT: v_perm_b32 v2, 0, s7, v0
; VI-NEXT: v_perm_b32 v1, 0, s4, v0
; VI-NEXT: v_perm_b32 v0, 0, s5, v0
; VI-NEXT: v_perm_b32 v3, 0, s10, v0
; VI-NEXT: v_perm_b32 v2, 0, s11, v0
; VI-NEXT: v_perm_b32 v1, 0, s8, v0
; VI-NEXT: v_perm_b32 v0, 0, s9, v0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16

@ -199,18 +199,18 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %
; GCN-NEXT: s_add_u32 s0, s0, s9
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: global_load_ushort v4, v[2:3], off
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: global_load_ushort v2, v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4
; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6
; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:4
; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:6
; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8
; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4
; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6

@ -51,24 +51,23 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x
; SI-LABEL: test_copy_v4i8_x2:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s12, s6
; SI-NEXT: s_mov_b32 s13, s7
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2:
@ -78,17 +77,17 @@ define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
@ -106,28 +105,25 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s22, 0
; SI-NEXT: s_mov_b32 s23, s11
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[20:21], s[6:7]
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[20:23], 0 addr64
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x3:
@ -144,17 +140,15 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -168,68 +162,70 @@ define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x11
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s20, s8
; SI-NEXT: s_mov_b32 s21, s9
; SI-NEXT: s_mov_b32 s8, s10
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: s_mov_b32 s18, s2
; SI-NEXT: s_mov_b32 s19, s3
; SI-NEXT: s_mov_b32 s22, s2
; SI-NEXT: s_mov_b32 s23, s3
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s22, s10
; SI-NEXT: s_mov_b32 s23, s11
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s20, s6
; SI-NEXT: s_mov_b32 s21, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT: buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x44
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s16, s8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s17, s9
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s12, s6
; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s22, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s23, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s20, s6
; VI-NEXT: s_mov_b32 s21, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[20:23], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -245,23 +241,22 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
; SI-LABEL: test_copy_v4i8_extra_use:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s12, 0xff00
; SI-NEXT: s_movk_i32 s13, 0xff
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s8, s4
; SI-NEXT: s_mov_b32 s9, s5
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s0, s6
; SI-NEXT: s_mov_b32 s1, s7
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
@ -277,47 +272,47 @@ define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_movk_i32 s10, 0x900
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_movk_i32 s12, 0xff00
; VI-NEXT: s_movk_i32 s13, 0xff
; VI-NEXT: s_movk_i32 s14, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_add_u32_e32 v0, vcc, s8, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_movk_i32 s8, 0xff00
; VI-NEXT: s_movk_i32 s9, 0xff
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, s8, v1
; VI-NEXT: v_and_b32_e32 v4, s12, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
; VI-NEXT: v_and_b32_e32 v1, s9, v1
; VI-NEXT: v_and_b32_e32 v1, s13, v1
; VI-NEXT: v_or_b32_e32 v1, v4, v1
; VI-NEXT: v_and_b32_e32 v2, s8, v0
; VI-NEXT: v_and_b32_e32 v3, s9, v3
; VI-NEXT: v_and_b32_e32 v2, s12, v0
; VI-NEXT: v_and_b32_e32 v3, s13, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_add_u16_e32 v1, s10, v1
; VI-NEXT: v_add_u16_e32 v2, s10, v2
; VI-NEXT: v_add_u16_e32 v1, s14, v1
; VI-NEXT: v_add_u16_e32 v2, s14, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -334,35 +329,32 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s18, 0
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s14, 0
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[16:17], s[6:7]
; SI-NEXT: s_mov_b64 s[12:13], s[6:7]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[16:19], 0 addr64
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, 0xff00
; SI-NEXT: s_movk_i32 s5, 0xff
; SI-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT: s_mov_b32 s16, 0xff00
; SI-NEXT: s_movk_i32 s17, 0xff
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s0, s2
; SI-NEXT: s_mov_b32 s1, s3
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: v_and_b32_e32 v4, s4, v1
; SI-NEXT: v_and_b32_e32 v4, s16, v1
; SI-NEXT: v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT: v_and_b32_e32 v2, s4, v0
; SI-NEXT: v_and_b32_e32 v3, s5, v3
; SI-NEXT: v_and_b32_e32 v2, s16, v0
; SI-NEXT: v_and_b32_e32 v3, s17, v3
; SI-NEXT: v_or_b32_e32 v2, v2, v3
; SI-NEXT: v_and_b32_e32 v1, s5, v1
; SI-NEXT: v_and_b32_e32 v1, s17, v1
; SI-NEXT: v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT: v_or_b32_e32 v1, v4, v1
; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
@ -370,51 +362,49 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_movk_i32 s16, 0xff00
; VI-NEXT: s_movk_i32 s17, 0xff
; VI-NEXT: s_movk_i32 s18, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_movk_i32 s4, 0xff00
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: s_movk_i32 s5, 0xff
; VI-NEXT: s_movk_i32 s6, 0x900
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: v_and_b32_e32 v4, s4, v1
; VI-NEXT: v_and_b32_e32 v4, s16, v1
; VI-NEXT: v_add_u16_e32 v1, 9, v1
; VI-NEXT: v_add_u16_e32 v3, 9, v0
; VI-NEXT: v_and_b32_e32 v1, s5, v1
; VI-NEXT: v_and_b32_e32 v1, s17, v1
; VI-NEXT: v_or_b32_e32 v1, v4, v1
; VI-NEXT: v_and_b32_e32 v2, s4, v0
; VI-NEXT: v_and_b32_e32 v3, s5, v3
; VI-NEXT: v_and_b32_e32 v2, s16, v0
; VI-NEXT: v_and_b32_e32 v3, s17, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_add_u16_e32 v1, s6, v1
; VI-NEXT: v_add_u16_e32 v2, s6, v2
; VI-NEXT: v_add_u16_e32 v1, s18, v1
; VI-NEXT: v_add_u16_e32 v2, s18, v2
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v1, v2, v1
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -429,18 +419,18 @@ define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %o
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s10, 0
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[8:9], s[6:7]
; SI-NEXT: s_mov_b64 s[4:5], s[10:11]
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@ -449,17 +439,15 @@ define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3
;
; VI-LABEL: test_copy_v3i8_align4:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0

@ -526,27 +526,27 @@ define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrsp
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; VI-NEXT: v_mov_b32_e32 v5, 0
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; VI-NEXT: v_mov_b32_e32 v4, 0
; VI-NEXT: v_mov_b32_e32 v2, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v6, s3
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, v6, v5, vcc
; VI-NEXT: v_mov_b32_e32 v5, s3
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v3
; VI-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v3
; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v0, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_ffbh_u32_e32 v6, v3
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: v_ffbh_u32_e32 v5, v0
; VI-NEXT: v_add_u32_e32 v5, vcc, 32, v5
; VI-NEXT: v_ffbh_u32_e32 v6, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v1, 64, v1, vcc
; VI-NEXT: flat_store_dwordx2 v[3:4], v[1:2]
; VI-NEXT: s_endpgm
;
; EG-LABEL: v_ctlz_i64:
@ -621,18 +621,18 @@ define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_add_u32_e32 v1, vcc, s0, v1
; VI-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
; VI-NEXT: flat_load_dwordx2 v[1:2], v[1:2]
; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0
; VI-NEXT: flat_load_dwordx2 v[0:1], v[1:2]
; VI-NEXT: v_addc_u32_e32 v4, vcc, v5, v4, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_add_u32_e32 v2, vcc, 32, v2
; VI-NEXT: v_ffbh_u32_e32 v5, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v0, 64, v1, vcc
; VI-NEXT: v_ffbh_u32_e32 v0, v1
; VI-NEXT: v_add_u32_e32 v0, vcc, 32, v0
; VI-NEXT: v_ffbh_u32_e32 v5, v2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; VI-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
; VI-NEXT: flat_store_dword v[3:4], v0
; VI-NEXT: s_endpgm
;

@ -177,11 +177,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* n
; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
; SI-SDWA: v_or_b32_e32
; SI-SDWA: v_or_b32_sdwa
; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]

@ -636,20 +636,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
; SI-LABEL: load_v4i8_to_v4f32_2_uses:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xb
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s2, 0
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s6, 0
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_movk_i32 s12, 0xff
; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_movk_i32 s8, 0xff
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; SI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
@ -659,57 +658,58 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s12, v4
; SI-NEXT: v_and_b32_e32 v0, s8, v4
; SI-NEXT: v_add_i32_e32 v2, vcc, 9, v5
; SI-NEXT: v_or_b32_e32 v0, v7, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v6
; SI-NEXT: v_and_b32_e32 v2, s12, v2
; SI-NEXT: v_and_b32_e32 v2, s8, v2
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: load_v4i8_to_v4f32_2_uses:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: v_mov_b32_e32 v5, 9
; VI-NEXT: s_movk_i32 s8, 0x900
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: v_mov_b32_e32 v4, 9
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: s_movk_i32 s0, 0x900
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v5
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v5
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v5
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v5
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v5
; VI-NEXT: v_lshrrev_b32_e32 v6, 24, v4
; VI-NEXT: v_cvt_f32_ubyte3_e32 v3, v4
; VI-NEXT: v_cvt_f32_ubyte2_e32 v2, v4
; VI-NEXT: v_cvt_f32_ubyte1_e32 v1, v4
; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v5
; VI-NEXT: v_add_u16_e32 v8, 9, v5
; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_and_b32_e32 v7, 0xffffff00, v4
; VI-NEXT: v_add_u16_e32 v8, 9, v4
; VI-NEXT: v_add_u16_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v6
; VI-NEXT: v_or_b32_sdwa v0, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_add_u16_e32 v0, s0, v0
; VI-NEXT: v_mov_b32_e32 v2, s8
; VI-NEXT: v_add_u16_e32 v0, s8, v0
; VI-NEXT: v_add_u16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
@ -733,29 +733,30 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; SI-NEXT: v_mov_b32_e32 v1, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:3
; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:4
; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:5
; SI-NEXT: buffer_load_ubyte v8, v[0:1], s[0:3], 0 addr64 offset:6
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt vmcnt(4)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v4
; SI-NEXT: s_waitcnt vmcnt(6)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v2
; SI-NEXT: s_waitcnt vmcnt(5)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v3
; SI-NEXT: s_waitcnt vmcnt(3)
; SI-NEXT: v_cvt_f32_ubyte2_e32 v1, v5
; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v2
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshlrev_b32_e32 v7, 8, v7
; SI-NEXT: v_or_b32_e32 v2, v7, v6
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v8
; SI-NEXT: v_cvt_f32_ubyte0_e32 v8, v3
; SI-NEXT: v_lshlrev_b32_e32 v9, 8, v4
; SI-NEXT: v_or_b32_e32 v2, v9, v6
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v7
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v8
; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v2
; SI-NEXT: v_cvt_f32_ubyte2_e32 v5, v5
; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v2
; SI-NEXT: buffer_store_dword v8, off, s[4:7], 0 offset:24
; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT: s_endpgm

@ -609,10 +609,10 @@ entry:
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cmp_nlt_f32_e32 vcc, v[[A_F32_1]], v[[B_F32_1]]
; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_cmp_nlt_f16_e32 vcc, v[[B_V2_F16]], v[[A_V2_F16]]
; GCN: v_cndmask_b32_e64 v[[R_I32_0:[0-9]+]]
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16_1]], v[[B_F16_1]]
; VI: v_cmp_nlt_f16_e32 vcc, v[[B_F16_1]], v[[A_F16_1]]
; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
; GCN: s_endpgm

@ -191,7 +191,7 @@ define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %
; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_sub_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
; GCN-NSZ-DAG: v_sub_f32_e32 [[NEG_ADD:v[0-9]+]], [[A]], [[B]]
; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
@ -1343,9 +1343,9 @@ define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)*
; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
; GCN-SAFE-DAG: v_fma_f32 [[FMA:v[0-9]+]]
; GCN-SAFE-DAG: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
; GCN-NSZ-NEXT: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_FMA]]

@ -289,35 +289,35 @@ bb3: ; preds = %bb3, %bb
define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
; GFX9-LABEL: urem16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_movk_i32 s6, 0x400
; GFX9-NEXT: s_movk_i32 s8, 0x400
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s3, s2, s3
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
; GFX9-NEXT: s_and_b32 s5, s4, s2
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: BB5_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
; GFX9-NEXT: v_and_b32_e32 v2, s4, v4
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v2
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v7, s5
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT: v_mul_f32_e32 v7, v8, v1
; GFX9-NEXT: v_trunc_f32_e32 v7, v7
; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v7
; GFX9-NEXT: v_mad_f32 v7, -v7, v0, v8
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1]
; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s8, v4
; GFX9-NEXT: v_mul_f32_e32 v9, v8, v1
; GFX9-NEXT: v_trunc_f32_e32 v9, v9
; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v9
; GFX9-NEXT: v_mad_f32 v8, -v9, v0, v8
; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v8|, v0
; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: v_addc_co_u32_e64 v8, s[2:3], 0, v10, s[2:3]
; GFX9-NEXT: v_mul_lo_u32 v8, v8, s5
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v7, v6, s[0:1]
; GFX9-NEXT: v_sub_u32_e32 v2, v2, v8
; GFX9-NEXT: global_store_short v[5:6], v2, off
; GFX9-NEXT: s_cbranch_vccz BB5_1
; GFX9-NEXT: ; %bb.2: ; %bb2
@ -398,38 +398,38 @@ define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %a
; GFX9-LABEL: srem16_invariant_denom:
; GFX9: ; %bb.0: ; %bb
; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_movk_i32 s3, 0x400
; GFX9-NEXT: s_movk_i32 s5, 0x400
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sext_i32_i16 s2, s2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
; GFX9-NEXT: s_sext_i32_i16 s4, s2
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s4
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GFX9-NEXT: BB7_1: ; %bb3
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_bfe_i32 v7, v4, 0, 16
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v7
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v4
; GFX9-NEXT: v_xor_b32_e32 v9, s4, v7
; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
; GFX9-NEXT: v_mov_b32_e32 v8, s5
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1
; GFX9-NEXT: v_xor_b32_e32 v9, s2, v7
; GFX9-NEXT: v_trunc_f32_e32 v8, v8
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v9
; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8
; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10
; GFX9-NEXT: v_mul_f32_e32 v9, v10, v1
; GFX9-NEXT: v_trunc_f32_e32 v9, v9
; GFX9-NEXT: v_cvt_i32_f32_e32 v11, v9
; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v10
; GFX9-NEXT: v_or_b32_e32 v2, 1, v2
; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, |v0|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[0:1]
; GFX9-NEXT: v_add_u32_e32 v2, v9, v2
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s2
; GFX9-NEXT: v_cmp_ge_f32_e64 s[2:3], |v9|, |v0|
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[2:3]
; GFX9-NEXT: v_add_u32_e32 v2, v11, v2
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s5, v4
; GFX9-NEXT: v_mov_b32_e32 v8, s7
; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s6, v5
; GFX9-NEXT: s_and_b64 vcc, exec, vcc
; GFX9-NEXT: v_sub_u32_e32 v2, v7, v2
; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
; GFX9-NEXT: global_store_short v[5:6], v2, off
; GFX9-NEXT: s_cbranch_vccz BB7_1
; GFX9-NEXT: ; %bb.2: ; %bb2

@ -2591,20 +2591,20 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_mov_b32 s0, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_lshr_b32 s1, s1, 16
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: s_and_b32 s3, s2, s0
; GFX8-NEXT: s_lshr_b32 s2, s2, 16
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: s_lshr_b32 s1, s1, 16
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
@ -2615,20 +2615,20 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
@ -2728,19 +2728,19 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s6
; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: flat_load_ushort v2, v[2:3]
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
; GFX8-NEXT: flat_load_ushort v1, v[2:3]
; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX8-NEXT: s_waitcnt vmcnt(1) lgkmcnt(0)
; GFX8-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v1, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX8-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX8-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX8-NEXT: v_mad_i32_i24 v0, v2, v0, s2
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v1, v0
; GFX8-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX8-NEXT: v_mad_i32_i24 v0, v1, v0, s2
; GFX8-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dword v[0:1], v2
@ -2755,20 +2755,20 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NODL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NODL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-NODL-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NODL-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-NODL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_bfe_i32 v1, v0, 0, 8
; GFX9-NODL-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX9-NODL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-NODL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-NODL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v2, v0, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
; GFX9-NODL-NEXT: v_mad_i32_i24 v0, v1, v0, s2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_store_dword v[0:1], v2, off
@ -2783,20 +2783,20 @@ define amdgpu_kernel void @notsdot2_sext8(<2 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s5
; GFX9-DL-NEXT: v_mov_b32_e32 v2, s6
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s7
; GFX9-DL-NEXT: global_load_ushort v2, v[2:3], off
; GFX9-DL-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-DL-NEXT: global_load_ushort v1, v[2:3], off
; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v3, v2, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v2, 8, v2
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_bfe_i32 v1, v0, 0, 8
; GFX9-DL-NEXT: v_bfe_i32 v2, v0, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v0, 8, v0
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 8
; GFX9-DL-NEXT: v_lshrrev_b16_e32 v1, 8, v1
; GFX9-DL-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX9-DL-NEXT: v_bfe_i32 v2, v2, 0, 8
; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v0, v2, v0, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v1, v0
; GFX9-DL-NEXT: v_mad_i32_i24 v0, v1, v0, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, v3, v2, v0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off

@ -221,28 +221,28 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i8 s3, s2
; GFX8-NEXT: s_bfe_i32 s5, s2, 0x80008
; GFX8-NEXT: s_sext_i32_i8 s2, s0
; GFX8-NEXT: s_sext_i32_i8 s3, s1
; GFX8-NEXT: s_bfe_i32 s5, s1, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_i32 s7, s2, 0x80010
; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010
; GFX8-NEXT: s_bfe_i32 s4, s0, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_i32 s6, s0, 0x80010
; GFX8-NEXT: s_ashr_i32 s2, s2, 24
; GFX8-NEXT: s_ashr_i32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_ashr_i32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
@ -252,28 +252,28 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
; GFX9-NODL-NEXT: s_bfe_i32 s5, s2, 0x80008
; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
; GFX9-NODL-NEXT: s_bfe_i32 s5, s1, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_i32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010
; GFX9-NODL-NEXT: s_bfe_i32 s4, s0, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NODL-NEXT: s_bfe_i32 s6, s0, 0x80010
; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT: s_ashr_i32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_ashr_i32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v4, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
@ -357,28 +357,28 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_movk_i32 s5, 0xff
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s7, s6, s5
; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008
; GFX7-NEXT: s_and_b32 s5, s4, s5
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010
; GFX7-NEXT: s_and_b32 s7, s4, s8
; GFX7-NEXT: s_and_b32 s6, s5, s8
; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT: s_lshr_b32 s6, s6, 24
; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@ -388,30 +388,30 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
; GFX8-NEXT: s_and_b32 s3, s2, s0
; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@ -420,30 +420,30 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;

@ -184,28 +184,28 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_movk_i32 s5, 0xff
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s7, s6, s5
; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008
; GFX7-NEXT: s_and_b32 s5, s4, s5
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010
; GFX7-NEXT: s_and_b32 s7, s4, s8
; GFX7-NEXT: s_and_b32 s6, s5, s8
; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT: s_lshr_b32 s6, s6, 24
; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@ -215,20 +215,20 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s0
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s7
@ -247,20 +247,20 @@ define amdgpu_kernel void @udot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
@ -354,28 +354,28 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_movk_i32 s5, 0xff
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b32 s7, s6, s5
; GFX7-NEXT: s_bfe_u32 s8, s6, 0x80008
; GFX7-NEXT: s_and_b32 s5, s4, s5
; GFX7-NEXT: v_mov_b32_e32 v1, s7
; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80010
; GFX7-NEXT: s_and_b32 s7, s4, s8
; GFX7-NEXT: s_and_b32 s6, s5, s8
; GFX7-NEXT: s_bfe_u32 s8, s5, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80010
; GFX7-NEXT: s_bfe_u32 s9, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v2, s8
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT: s_lshr_b32 s6, s6, 24
; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@ -385,30 +385,30 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
; GFX8-NEXT: s_and_b32 s3, s2, s0
; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX8-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@ -417,30 +417,30 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s6, s2, 0x80010
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@ -1426,28 +1426,28 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_sext_i32_i8 s7, s6
; GFX7-NEXT: s_bfe_u32 s9, s6, 0x80008
; GFX7-NEXT: s_sext_i32_i8 s5, s4
; GFX7-NEXT: s_sext_i32_i8 s6, s4
; GFX7-NEXT: s_sext_i32_i8 s7, s5
; GFX7-NEXT: s_bfe_u32 s9, s5, 0x80008
; GFX7-NEXT: s_and_b32 s7, s7, s8
; GFX7-NEXT: s_bfe_u32 s10, s4, 0x80008
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: s_bfe_u32 s11, s6, 0x80010
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_bfe_u32 s11, s5, 0x80010
; GFX7-NEXT: s_and_b32 s6, s6, s8
; GFX7-NEXT: v_mov_b32_e32 v3, s7
; GFX7-NEXT: s_bfe_u32 s12, s4, 0x80010
; GFX7-NEXT: s_lshr_b32 s6, s6, 24
; GFX7-NEXT: s_lshr_b32 s5, s5, 24
; GFX7-NEXT: v_mov_b32_e32 v2, s11
; GFX7-NEXT: s_lshr_b32 s4, s4, 24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v3, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s6, v3, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
@ -1457,28 +1457,28 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: s_sext_i32_i8 s3, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
; GFX8-NEXT: s_sext_i32_i8 s1, s0
; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
; GFX8-NEXT: s_sext_i32_i8 s3, s1
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX8-NEXT: s_sext_i32_i8 s2, s0
; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s2, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
@ -1488,28 +1488,28 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NODL-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX9-NODL-NEXT: s_sext_i32_i8 s2, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
@ -1519,28 +1519,28 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-DL-NEXT: s_sext_i32_i8 s3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x80008
; GFX9-DL-NEXT: s_sext_i32_i8 s1, s0
; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x80008
; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s5
; GFX9-DL-NEXT: s_bfe_u32 s7, s1, 0x80010
; GFX9-DL-NEXT: s_sext_i32_i8 s2, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x80010
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v3, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v4, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v5, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
@ -1809,29 +1809,29 @@ define amdgpu_kernel void @udot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_movk_i32 s8, 0xff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_movk_i32 s7, 0xff
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_u32 s10, s6, 0x80008
; GFX7-NEXT: s_bfe_u32 s12, s6, 0x80010
; GFX7-NEXT: s_lshr_b32 s9, s6, 24
; GFX7-NEXT: s_and_b32 s6, s6, s7
; GFX7-NEXT: s_lshr_b32 s5, s4, 24
; GFX7-NEXT: s_bfe_u32 s8, s4, 0x80008
; GFX7-NEXT: s_lshr_b32 s6, s4, 24
; GFX7-NEXT: s_bfe_u32 s10, s5, 0x80008
; GFX7-NEXT: s_bfe_u32 s12, s5, 0x80010
; GFX7-NEXT: s_lshr_b32 s9, s5, 24
; GFX7-NEXT: s_and_b32 s5, s5, s8
; GFX7-NEXT: s_bfe_u32 s7, s4, 0x80008
; GFX7-NEXT: s_bfe_u32 s11, s4, 0x80010
; GFX7-NEXT: s_and_b32 s4, s4, s7
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: s_and_b32 s4, s4, s8
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: v_mov_b32_e32 v3, s12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s8, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s7, v2, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s9
; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0
; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;

@ -657,43 +657,43 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_movk_i32 s2, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s6, s3, 0x40000
; GFX8-NEXT: s_lshr_b32 s4, s3, 12
; GFX8-NEXT: s_bfe_i32 s8, s3, 0x40004
; GFX8-NEXT: s_bfe_i32 s10, s3, 0x40008
; GFX8-NEXT: s_lshr_b32 s1, s0, 12
; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX8-NEXT: s_lshr_b32 s3, s1, 12
; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000
; GFX8-NEXT: s_lshr_b32 s4, s2, 12
; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004
; GFX8-NEXT: s_bfe_i32 s10, s2, 0x40008
; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000
; GFX8-NEXT: v_mov_b32_e32 v6, s6
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1
; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s3
; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4
; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008
; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004
; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: v_mov_b32_e32 v7, s8
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3
; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40010
; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
; GFX8-NEXT: v_and_b32_e32 v5, s2, v5
; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40014
; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010
; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX8-NEXT: v_and_b32_e32 v4, s0, v4
; GFX8-NEXT: v_and_b32_e32 v5, s0, v5
; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40010
; GFX8-NEXT: v_mov_b32_e32 v8, s12
; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40018
; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014
; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014
; GFX8-NEXT: v_mov_b32_e32 v9, s14
; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018
; GFX8-NEXT: s_ashr_i32 s3, s3, 28
; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40018
; GFX8-NEXT: s_ashr_i32 s2, s2, 28
; GFX8-NEXT: v_mov_b32_e32 v10, s16
; GFX8-NEXT: s_ashr_i32 s0, s0, 28
; GFX8-NEXT: s_ashr_i32 s1, s1, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2
@ -702,8 +702,8 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s2
; GFX8-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@ -711,43 +711,43 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_movk_i32 s2, 0xff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: s_movk_i32 s0, 0xff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_i32 s6, s3, 0x40000
; GFX9-NEXT: s_lshr_b32 s4, s3, 12
; GFX9-NEXT: s_bfe_i32 s8, s3, 0x40004
; GFX9-NEXT: s_bfe_i32 s10, s3, 0x40008
; GFX9-NEXT: s_lshr_b32 s1, s0, 12
; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX9-NEXT: s_lshr_b32 s3, s1, 12
; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
; GFX9-NEXT: s_lshr_b32 s4, s2, 12
; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
; GFX9-NEXT: s_bfe_i32 s10, s2, 0x40008
; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s3
; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4
; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008
; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004
; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: v_mov_b32_e32 v7, s8
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3
; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40010
; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
; GFX9-NEXT: v_and_b32_e32 v5, s2, v5
; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40014
; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010
; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX9-NEXT: v_and_b32_e32 v4, s0, v4
; GFX9-NEXT: v_and_b32_e32 v5, s0, v5
; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010
; GFX9-NEXT: v_mov_b32_e32 v8, s12
; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40018
; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014
; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v9, s14
; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018
; GFX9-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018
; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: v_mov_b32_e32 v10, s16
; GFX9-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-NEXT: s_ashr_i32 s1, s1, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2
@ -756,8 +756,8 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
@ -765,43 +765,43 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x40000
; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 12
; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x40008
; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12
; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000
; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12
; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 12
; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x40008
; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000
; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4
; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008
; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004
; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3
; GFX9-DL-NEXT: s_bfe_i32 s12, s3, 0x40010
; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5
; GFX9-DL-NEXT: s_bfe_i32 s14, s3, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010
; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010
; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4
; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5
; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010
; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12
; GFX9-DL-NEXT: s_bfe_i32 s16, s3, 0x40018
; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014
; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018
; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14
; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018
; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28
; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018
; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16
; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2
@ -810,8 +810,8 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
;
@ -1462,19 +1462,19 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s8, 0xffff
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018
; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014
; GFX7-NEXT: s_bfe_i32 s17, s6, 0x40010
; GFX7-NEXT: s_bfe_i32 s18, s6, 0x40000
; GFX7-NEXT: s_bfe_i32 s19, s6, 0x40004
; GFX7-NEXT: s_bfe_i32 s20, s6, 0x40008
; GFX7-NEXT: s_ashr_i32 s14, s6, 28
; GFX7-NEXT: s_bfe_i32 s6, s6, 0x4000c
; GFX7-NEXT: s_ashr_i32 s5, s4, 28
; GFX7-NEXT: s_ashr_i32 s6, s4, 28
; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40018
; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40014
; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40010
; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40000
; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40004
; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40008
; GFX7-NEXT: s_ashr_i32 s14, s5, 28
; GFX7-NEXT: s_bfe_i32 s5, s5, 0x4000c
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014
; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010
@ -1485,7 +1485,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008
; GFX7-NEXT: v_mov_b32_e32 v2, s20
; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c
; GFX7-NEXT: v_mov_b32_e32 v1, s6
; GFX7-NEXT: v_mov_b32_e32 v1, s5
; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2
; GFX7-NEXT: v_mul_i32_i24_e32 v3, s12, v3
@ -1510,7 +1510,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s7, v7, v0
; GFX7-NEXT: v_mov_b32_e32 v1, s14
; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0
; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
@ -1954,24 +1954,24 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s7, s0, 4
; GFX9-NEXT: s_lshr_b32 s14, s1, 4
; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0
; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
; GFX9-NEXT: s_lshr_b32 s7, s1, 4
; GFX9-NEXT: s_lshr_b32 s14, s2, 4
; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s1
; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14
; GFX9-NEXT: s_lshr_b32 s8, s0, 12
; GFX9-NEXT: s_lshr_b32 s9, s0, 8
; GFX9-NEXT: s_lshr_b32 s15, s1, 12
; GFX9-NEXT: s_lshr_b32 s16, s1, 8
; GFX9-NEXT: s_lshr_b32 s8, s1, 12
; GFX9-NEXT: s_lshr_b32 s9, s1, 8
; GFX9-NEXT: s_lshr_b32 s15, s2, 12
; GFX9-NEXT: s_lshr_b32 s16, s2, 8
; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9
; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8
; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16
@ -1987,21 +1987,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4
; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: s_lshr_b32 s3, s0, 20
; GFX9-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-NEXT: s_lshr_b32 s10, s1, 20
; GFX9-NEXT: s_lshr_b32 s11, s1, 16
; GFX9-NEXT: s_lshr_b32 s3, s1, 20
; GFX9-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-NEXT: s_lshr_b32 s10, s2, 20
; GFX9-NEXT: s_lshr_b32 s11, s2, 16
; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12
; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4
; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3
; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11
; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10
; GFX9-NEXT: s_lshr_b32 s5, s0, 28
; GFX9-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-NEXT: s_lshr_b32 s12, s1, 28
; GFX9-NEXT: s_lshr_b32 s13, s1, 24
; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
; GFX9-NEXT: s_lshr_b32 s5, s1, 28
; GFX9-NEXT: s_lshr_b32 s6, s1, 24
; GFX9-NEXT: s_lshr_b32 s12, s2, 28
; GFX9-NEXT: s_lshr_b32 s13, s2, 24
; GFX9-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6
; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5
@ -2023,7 +2023,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v8, v8, v15
; GFX9-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
; GFX9-NEXT: v_and_b32_e32 v4, s0, v4
; GFX9-NEXT: v_or_b32_e32 v6, v4, v8
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v2, v3, v2
@ -2042,24 +2042,24 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4
; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 4
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
; GFX9-DL-NEXT: s_lshr_b32 s7, s1, 4
; GFX9-DL-NEXT: s_lshr_b32 s14, s2, 4
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s1
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14
; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 12
; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 8
; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 12
; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 8
; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 12
; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 8
; GFX9-DL-NEXT: s_lshr_b32 s15, s2, 12
; GFX9-DL-NEXT: s_lshr_b32 s16, s2, 8
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16
@ -2075,21 +2075,21 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 20
; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16
; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 20
; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16
; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 20
; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 16
; GFX9-DL-NEXT: s_lshr_b32 s10, s2, 20
; GFX9-DL-NEXT: s_lshr_b32 s11, s2, 16
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10
; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 28
; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 24
; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 28
; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 24
; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 28
; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 24
; GFX9-DL-NEXT: s_lshr_b32 s12, s2, 28
; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 24
; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6
; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5
@ -2111,7 +2111,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v8, v8, v15
; GFX9-DL-NEXT: v_or_b32_sdwa v8, v8, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
; GFX9-DL-NEXT: v_and_b32_e32 v4, s0, v4
; GFX9-DL-NEXT: v_or_b32_e32 v6, v4, v8
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2

@ -2426,38 +2426,38 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40010
; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40010
; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014
; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018
; GFX9-NEXT: s_lshr_b32 s13, s1, 28
; GFX9-NEXT: s_and_b32 s14, s1, 15
; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004
; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008
; GFX9-NEXT: s_bfe_u32 s3, s1, 0x40010
; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40014
; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s13, s2, 28
; GFX9-NEXT: s_and_b32 s14, s2, 15
; GFX9-NEXT: s_bfe_u32 s15, s2, 0x40004
; GFX9-NEXT: s_bfe_u32 s16, s2, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v3, s10
; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014
; GFX9-NEXT: s_bfe_u32 s2, s2, 0x4000c
; GFX9-NEXT: s_bfe_u32 s4, s1, 0x40014
; GFX9-NEXT: v_mov_b32_e32 v4, s11
; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018
; GFX9-NEXT: s_bfe_u32 s5, s1, 0x40018
; GFX9-NEXT: v_mov_b32_e32 v5, s12
; GFX9-NEXT: s_lshr_b32 s6, s0, 28
; GFX9-NEXT: s_lshr_b32 s6, s1, 28
; GFX9-NEXT: v_mov_b32_e32 v6, s13
; GFX9-NEXT: s_and_b32 s7, s0, 15
; GFX9-NEXT: s_and_b32 s7, s1, 15
; GFX9-NEXT: v_mov_b32_e32 v7, s14
; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004
; GFX9-NEXT: s_bfe_u32 s8, s1, 0x40004
; GFX9-NEXT: v_mov_b32_e32 v8, s15
; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008
; GFX9-NEXT: s_bfe_u32 s9, s1, 0x40008
; GFX9-NEXT: v_mov_b32_e32 v9, s16
; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v10, s1
; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v10, s2
; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3
; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5
@ -2468,12 +2468,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v5, v7, v8
; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9
; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v5, s2, v5
; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_and_b32_e32 v5, s0, v5
; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_or_b32_e32 v6, v5, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v6
; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
; GFX9-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-NEXT: v_or_b32_e32 v4, v3, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
@ -2492,38 +2492,38 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40010
; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40010
; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014
; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28
; GFX9-DL-NEXT: s_and_b32 s14, s1, 15
; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s3, s1, 0x40010
; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40014
; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s13, s2, 28
; GFX9-DL-NEXT: s_and_b32 s14, s2, 15
; GFX9-DL-NEXT: s_bfe_u32 s15, s2, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s16, s2, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10
; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014
; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x4000c
; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x40014
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11
; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018
; GFX9-DL-NEXT: s_bfe_u32 s5, s1, 0x40018
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12
; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28
; GFX9-DL-NEXT: s_lshr_b32 s6, s1, 28
; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13
; GFX9-DL-NEXT: s_and_b32 s7, s0, 15
; GFX9-DL-NEXT: s_and_b32 s7, s1, 15
; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14
; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x40004
; GFX9-DL-NEXT: v_mov_b32_e32 v8, s15
; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008
; GFX9-DL-NEXT: s_bfe_u32 s9, s1, 0x40008
; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16
; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1
; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v10, s2
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5
@ -2534,12 +2534,12 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8
; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5
; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s1, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_and_b32_e32 v5, s0, v5
; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-DL-NEXT: v_or_b32_e32 v6, v5, v6
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v6
; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
; GFX9-DL-NEXT: v_and_b32_e32 v3, s0, v3
; GFX9-DL-NEXT: v_or_b32_e32 v4, v3, v4
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2

@ -73,12 +73,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
@ -95,11 +95,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-NEXT: s_lshr_b32 s0, s2, 16
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
@ -133,18 +133,18 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s1, s4, 0xffff
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s0, s0, 16
; CI-NEXT: s_lshl_b32 s2, s0, 16
; CI-NEXT: s_or_b32 s1, s1, s2
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: s_lshr_b32 s1, s2, 16
; CI-NEXT: s_lshl_b32 s2, s1, 16
; CI-NEXT: s_or_b32 s0, s0, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s0
; CI-NEXT: ; use s1
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
@ -192,12 +192,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_and_b32 s0, s2, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
@ -216,16 +216,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s0, s[2:3], 0x0
; GFX9-NEXT: s_lshr_b32 s0, s4, 16
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_lshr_b32 s1, s4, 16
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_pack_lh_b32_b16 s0, s1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: s_pack_lh_b32_b16 s1, s0, s2
; GFX9-NEXT: v_mov_b32_e32 v2, s1
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use s1
; GFX9-NEXT: ; use s0
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_endpgm
;
@ -234,17 +234,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s0, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
; VI-NEXT: s_lshr_b32 s0, s4, 16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s0, s0, 0xffff0000
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_and_b32 s1, s2, 0xffff0000
; VI-NEXT: s_or_b32 s1, s0, s1
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ; use s1
; VI-NEXT: ; use s0
; VI-NEXT: ;;#ASMEND
; VI-NEXT: s_endpgm
;
@ -253,17 +253,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s1, s4, 16
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff0000
; CI-NEXT: s_or_b32 s0, s1, s0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
; CI-NEXT: s_or_b32 s1, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
; CI-NEXT: ; use s1
; CI-NEXT: ; use s0
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
@ -426,12 +426,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s0, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s0, s0, 0xffff
; CI-NEXT: s_and_b32 s0, s2, 0xffff
; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dword v[0:1], v2
@ -624,15 +624,15 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_lshr_b32 s0, s4, 16
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; CI-NEXT: v_or_b32_e32 v0, s0, v0
; CI-NEXT: flat_store_dword v[2:3], v0
; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; CI-NEXT: v_or_b32_e32 v2, s0, v2
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
@ -849,15 +849,15 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, 0x4500
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_0:
@ -1107,13 +1107,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[2:3], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
; GFX9-NEXT: s_lshl_b32 s0, s4, 4
; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_bfi_b32 v2, s0, v2, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
@ -1125,13 +1125,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; VI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_load_dword s0, s[4:5], 0x0
; VI-NEXT: s_load_dword s1, s[2:3], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s0, s0, 4
; VI-NEXT: s_lshl_b32 s0, s4, 4
; VI-NEXT: s_lshl_b32 s0, 0xffff, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: v_bfi_b32 v2, s0, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@ -1143,13 +1143,13 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)*
; CI-NEXT: v_mov_b32_e32 v2, 0x3e703e7
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_load_dword s0, s[4:5], 0x0
; CI-NEXT: s_load_dword s1, s[2:3], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshl_b32 s0, s0, 4
; CI-NEXT: s_lshl_b32 s0, s4, 4
; CI-NEXT: s_lshl_b32 s0, 0xffff, s0
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: v_bfi_b32 v2, s0, v2, v3
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@ -1240,24 +1240,25 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s0
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: s_mov_b32 s0, 0x12341234
; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v2, v2, s0, v3
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
@ -1266,24 +1267,25 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; VI-NEXT: v_lshlrev_b32_e64 v1, v1, s0
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_mov_b32 s0, 0x12341234
; VI-NEXT: v_bfi_b32 v0, v1, s0, v0
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; VI-NEXT: v_lshlrev_b32_e64 v2, v2, s2
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_bfi_b32 v2, v2, s0, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_insertelement_v2f16_dynamic_vgpr:
@ -1299,17 +1301,17 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dword v2, v[2:3]
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: v_add_i32_e32 v4, vcc, s0, v4
; CI-NEXT: v_mov_b32_e32 v5, s1
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v4
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 s0, 0x12341234
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v2
; CI-NEXT: v_lshl_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; CI-NEXT: v_lshl_b32_e32 v2, 0xffff, v2
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_bfi_b32 v0, v1, s0, v0
; CI-NEXT: flat_store_dword v[4:5], v0
; CI-NEXT: v_bfi_b32 v2, v2, s0, v3
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
@ -1684,26 +1686,26 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v2, v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
; GFX9-NEXT: s_pack_ll_b32_b16 s0, s4, s4
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, v5, s0, v1
; GFX9-NEXT: v_bfi_b32 v0, v4, s0, v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: v_bfi_b32 v1, v3, s1, v1
; GFX9-NEXT: v_bfi_b32 v0, v2, s1, v0
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
@ -1717,17 +1719,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_and_b32 s2, s4, s0
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: s_lshl_b32 s3, s2, 16
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_and_b32 s1, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b32 s0, s1, 16
; VI-NEXT: s_or_b32 s0, s1, s0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[0:1]
; VI-NEXT: s_or_b32 s0, s2, s3
; VI-NEXT: v_lshlrev_b64 v[4:5], v4, s[2:3]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_bfi_b32 v1, v5, s0, v1
; VI-NEXT: v_bfi_b32 v0, v4, s0, v0
@ -1736,26 +1738,26 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspac
;
; CI-LABEL: v_insertelement_v4i16_dynamic_vgpr:
; CI: ; %bb.0:
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; CI-NEXT: s_mov_b32 s6, 0xffff
; CI-NEXT: s_mov_b32 s7, 0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: s_mov_b32 s2, 0xffff
; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_lshl_b32 s1, s4, 16
; CI-NEXT: s_and_b32 s3, s4, s6
; CI-NEXT: s_and_b32 s4, s4, s2
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_or_b32 s0, s3, s1
; CI-NEXT: s_or_b32 s0, s4, s1
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v4
; CI-NEXT: v_lshl_b64 v[4:5], s[6:7], v4
; CI-NEXT: v_lshl_b64 v[4:5], s[2:3], v4
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: v_bfi_b32 v1, v5, s0, v1
; CI-NEXT: v_bfi_b32 v0, v4, s0, v0
@ -1785,19 +1787,19 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4
; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_mov_b32 s3, 0
; GFX9-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NEXT: s_lshl_b32 s1, s5, 4
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NEXT: s_lshl_b32 s3, s5, 4
; GFX9-NEXT: v_mov_b32_e32 v4, s2
; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; GFX9-NEXT: v_mov_b32_e32 v5, s2
; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s4
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v1, s1, v5, v1
; GFX9-NEXT: v_bfi_b32 v0, s0, v4, v0
; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1
; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: s_endpgm
;
@ -1807,19 +1809,19 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_mov_b32 s0, 0xffff
; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: s_mov_b32 s1, 0
; VI-NEXT: s_lshl_b32 s2, s5, 4
; VI-NEXT: s_and_b32 s3, s4, s0
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], s2
; VI-NEXT: s_lshl_b32 s2, s3, 16
; VI-NEXT: s_or_b32 s2, s3, s2
; VI-NEXT: s_mov_b32 s3, 0
; VI-NEXT: s_lshl_b32 s1, s5, 4
; VI-NEXT: s_and_b32 s4, s4, s2
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; VI-NEXT: s_lshl_b32 s2, s4, 16
; VI-NEXT: s_or_b32 s2, s4, s2
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_mov_b32_e32 v5, s2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@ -1839,15 +1841,15 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspa
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_and_b32 s2, s4, s0
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: s_mov_b32 s2, 0xffff
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: s_or_b32 s2, s2, s4
; CI-NEXT: s_mov_b32 s1, 0
; CI-NEXT: s_lshl_b32 s3, s5, 4
; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], s3
; CI-NEXT: s_and_b32 s6, s4, s2
; CI-NEXT: s_mov_b32 s3, 0
; CI-NEXT: s_lshl_b32 s1, s5, 4
; CI-NEXT: s_lshl_b32 s4, s4, 16
; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v2
; CI-NEXT: s_lshl_b64 s[0:1], s[2:3], s1
; CI-NEXT: s_or_b32 s2, s6, s4
; CI-NEXT: v_mov_b32_e32 v4, s2
; CI-NEXT: v_mov_b32_e32 v5, s2
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc


@ -11,6 +11,7 @@
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v[1:2], s[0:3], 0 idxen offen glc
;SICI: v_mov_b32_e32 v1, 0x2000
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_swap v0, v2, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)


@ -79,7 +79,7 @@ main_body:
;CHECK-NOT: s_waitcnt
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[0:3], 0 idxen glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: s_movk_i32 [[SOFS:s[0-9]+]], 0x1ffc
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 idxen offen glc


@ -1559,24 +1559,22 @@ define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out
; VI-LABEL: simplify_bfe_u32_multi_use_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s11, 0xf000
; VI-NEXT: s_mov_b32 s10, -1
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: s_mov_b32 s8, s4
; VI-NEXT: s_mov_b32 s9, s5
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v0, 63, v0
; VI-NEXT: v_bfe_u32 v1, v0, 2, 2
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
i32 addrspace(1)* %out1,
i32 addrspace(1)* %in) #0 {


@ -118,20 +118,20 @@ define amdgpu_kernel void @cos_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
; GFX9-LABEL: cos_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0
; GFX9-NEXT: v_cos_f16_e32 v3, v1
; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cos_f16_e32 v2, v0
; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0
; GFX9-NEXT: v_cos_f16_e32 v2, v2
; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_cos_f16_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a


@ -173,8 +173,8 @@ define amdgpu_kernel void @fma_v2f16(
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], s[[A_F16]], v[[C_F16_1]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], s[[A_F16]], v[[C_V2_F16]]
; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
@ -198,8 +198,9 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
@ -243,8 +244,9 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}


@ -43,17 +43,17 @@ define amdgpu_kernel void @maxnum_f16(
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s6
; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
@ -68,17 +68,17 @@ define amdgpu_kernel void @maxnum_f16(
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -292,17 +292,17 @@ define amdgpu_kernel void @maxnum_v2f16(
; GFX9-LABEL: maxnum_v2f16:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
; GFX9-NEXT: v_pk_max_f16 v1, s6, s6
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@ -362,18 +362,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
;
; GFX9-LABEL: maxnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s0, 0x44004200
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: s_mov_b32 s4, 0x44004200
; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) #0 {
@ -429,18 +429,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
;
; GFX9-LABEL: maxnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s0, 0x42004400
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: s_mov_b32 s4, 0x42004400
; GFX9-NEXT: v_pk_max_f16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
; GFX9-NEXT: v_pk_max_f16 v0, v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) #0 {
@ -735,12 +735,12 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
; GFX9-NEXT: v_pk_max_f16 v2, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_max_f16 v1, v0, s8
; GFX9-NEXT: v_pk_max_f16 v0, v2, s9
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0


@ -43,17 +43,17 @@ define amdgpu_kernel void @minnum_f16_ieee(
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s12, s6
; VI-NEXT: s_mov_b32 s13, s7
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_waitcnt vmcnt(0)
@ -68,17 +68,17 @@ define amdgpu_kernel void @minnum_f16_ieee(
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s14, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s12, s6
; GFX9-NEXT: s_mov_b32 s13, s7
; GFX9-NEXT: s_mov_b32 s15, s3
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GFX9-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -315,17 +315,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; GFX9-LABEL: minnum_v2f16_ieee:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s7, s[0:1], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_load_dword s5, s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v1, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
; GFX9-NEXT: v_pk_max_f16 v1, s6, s6
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_min_f16 v0, v1, v0
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@ -415,18 +415,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
;
; GFX9-LABEL: minnum_v2f16_imm_a:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s0, 0x44004200
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: s_mov_b32 s4, 0x44004200
; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) #0 {
@ -482,18 +482,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
;
; GFX9-LABEL: minnum_v2f16_imm_b:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX9-NEXT: s_mov_b32 s4, s0
; GFX9-NEXT: s_mov_b32 s0, 0x42004400
; GFX9-NEXT: s_mov_b32 s5, s1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s4, s4
; GFX9-NEXT: s_mov_b32 s4, 0x42004400
; GFX9-NEXT: v_pk_min_f16 v0, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: v_pk_max_f16 v0, s2, s2
; GFX9-NEXT: v_pk_min_f16 v0, v0, s0
; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX9-NEXT: s_endpgm
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) #0 {
@ -788,12 +788,12 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_pk_max_f16 v0, s5, s5
; GFX9-NEXT: v_pk_max_f16 v2, s4, s4
; GFX9-NEXT: v_pk_max_f16 v0, s7, s7
; GFX9-NEXT: v_pk_max_f16 v2, s6, s6
; GFX9-NEXT: v_pk_min_f16 v1, v0, s8
; GFX9-NEXT: v_pk_min_f16 v0, v2, s9
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0


@ -77,15 +77,15 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b64 s[0:1], s[6:7]
; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; SI-NEXT: s_movk_i32 s9, 0xfc01
; SI-NEXT: s_mov_b32 s7, 0xfffff
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_brev_b32 s8, -2
; SI-NEXT: s_movk_i32 s7, 0xfc01
; SI-NEXT: s_mov_b32 s1, 0xfffff
; SI-NEXT: s_mov_b32 s0, -1
; SI-NEXT: s_brev_b32 s6, -2
; SI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_bfe_u32 v4, v3, 20, 11
; SI-NEXT: v_add_i32_e32 v6, vcc, s9, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[6:7], v6
; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v4
; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6
; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3
; SI-NEXT: v_not_b32_e32 v4, v4
; SI-NEXT: v_not_b32_e32 v5, v5
@ -98,7 +98,7 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc
; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc
; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
; SI-NEXT: v_bfi_b32 v2, s8, v8, v3
; SI-NEXT: v_bfi_b32 v2, s6, v8, v3
; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; SI-NEXT: s_mov_b64 s[6:7], s[2:3]
; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
@ -117,14 +117,14 @@ define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspa
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_brev_b32 s6, -2
; CI-NEXT: s_brev_b32 s0, -2
; CI-NEXT: v_mov_b32_e32 v8, 0x3ff00000
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_trunc_f64_e32 v[4:5], v[2:3]
; CI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5]
; CI-NEXT: v_bfi_b32 v2, s6, v8, v3
; CI-NEXT: v_bfi_b32 v2, s0, v8, v3
; CI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]


@ -118,20 +118,20 @@ define amdgpu_kernel void @sin_v2f16(<2 x half> addrspace(1)* %r, <2 x half> add
; GFX9-LABEL: sin_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0x3118
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0x3118
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_f16_e32 v1, 0.15915494, v0
; GFX9-NEXT: v_sin_f16_e32 v3, v1
; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_sin_f16_e32 v2, v0
; GFX9-NEXT: v_mul_f16_e32 v2, 0.15915494, v0
; GFX9-NEXT: v_sin_f16_e32 v2, v2
; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX9-NEXT: v_sin_f16_e32 v3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%a.val = load <2 x half>, <2 x half> addrspace(1)* %a


@ -119,12 +119,12 @@ define amdgpu_kernel void @muli24_shl64(i64 addrspace(1)* nocapture %arg, i32 ad
; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: buffer_load_dword v1, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: v_lshlrev_b32_e32 v3, 3, v0
; GCN-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: v_mov_b32_e32 v4, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v0
; GCN-NEXT: v_or_b32_e32 v0, 0x800000, v1
; GCN-NEXT: v_mul_i32_i24_e32 v0, -7, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 3, v0
; GCN-NEXT: buffer_store_dwordx2 v[1:2], v[3:4], s[4:7], 0 addr64


@ -87,23 +87,23 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; VI-LABEL: v_lshr_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v4, v1, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v5
; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v2i16:
@ -117,14 +117,14 @@ define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshr_b32_e32 v2, v2, v3
; CI-NEXT: v_lshr_b32_e32 v3, v4, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
@ -171,39 +171,39 @@ define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, s0, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshrrev_b16_e32 v4, s0, v3
; VI-NEXT: v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s9, s0, 16
; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, s9, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, s10, v2
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v3, s1, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@ -246,39 +246,39 @@ define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v1, v0, s0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshrrev_b16_e64 v4, v3, s0
; VI-NEXT: v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_s_v_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s9, s0, 16
; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshr_b32_e32 v3, s9, v3
; CI-NEXT: v_lshr_b32_e32 v2, s10, v2
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshr_b32_e32 v3, s1, v3
; CI-NEXT: v_lshr_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@ -320,15 +320,15 @@ define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v1, v0, 8
; VI-NEXT: v_lshrrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshrrev_b16_e64 v2, v3, 8
; VI-NEXT: v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: lshr_imm_v_v2i16:
@ -428,45 +428,45 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; GFX9-LABEL: v_lshr_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v5
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3
; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_lshr_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v6, v5, v1
; VI-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v5, v4, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v6, v3, v1
; VI-NEXT: v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v0
; VI-NEXT: v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_lshr_v4i16:
@ -480,7 +480,7 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
@ -488,10 +488,10 @@ define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v8, 16, v4
; CI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_and_b32_e32 v4, s8, v4
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v5, s8, v5
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_and_b32_e32 v4, s0, v4
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_and_b32_e32 v5, s0, v5
; CI-NEXT: v_lshr_b32_e32 v3, v3, v5
; CI-NEXT: v_lshr_b32_e32 v5, v7, v9
; CI-NEXT: v_lshr_b32_e32 v2, v2, v4
@ -565,13 +565,13 @@ define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s8, 0xff00ff
; CI-NEXT: s_mov_b32 s0, 0xff00ff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 8, v3
; CI-NEXT: v_lshrrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()


@ -39,7 +39,8 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
; it.
; GCN-LABEL: {{^}}madak_2_use_f32:
; GFX8_9_10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX9: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX10: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8
@ -47,6 +48,7 @@ define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float add
; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}
; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; GFX10-MAD-DAG:v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
; FMA-DAG: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000


@ -73,16 +73,16 @@ define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_i16_e32 v2, v0, v1
; VI-NEXT: v_max_i16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: v_max_i16_e32 v3, v5, v2
; VI-NEXT: v_max_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v2i16:
@ -124,63 +124,64 @@ define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v6, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v4
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v6
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: v_add_u32_e32 v6, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v6, v[6:7]
; VI-NEXT: flat_load_dword v7, v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v4, v[4:5]
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, 4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: flat_load_dword v8, v[2:3]
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v7, v[2:3]
; VI-NEXT: flat_load_ushort v8, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v6
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_max_i16_e32 v0, v6, v0
; VI-NEXT: v_max_i16_e32 v6, v5, v7
; VI-NEXT: v_max_i16_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_i16_e32 v1, v7, v8
; VI-NEXT: v_max_i16_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v1, v7
; VI-NEXT: flat_store_short v[2:3], v0
; VI-NEXT: flat_store_dword v[4:5], v1
; VI-NEXT: v_max_i16_e32 v4, v4, v8
; VI-NEXT: v_or_b32_e32 v5, v6, v5
; VI-NEXT: flat_store_short v[2:3], v4
; VI-NEXT: flat_store_dword v[0:1], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_imax_sge_v3i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: v_mov_b32_e32 v6, 0
; GFX9-NEXT: v_mov_b32_e32 v7, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_short_d16 v7, v[0:1], off offset:4
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_short_d16 v6, v[2:3], off offset:4
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4
; GFX9-NEXT: global_load_dword v7, v[0:1], off
; GFX9-NEXT: global_load_short_d16 v4, v[2:3], off offset:4
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v5
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_pk_max_i16 v3, v6, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_max_i16 v0, v0, v1
; GFX9-NEXT: v_pk_max_i16 v1, v7, v6
; GFX9-NEXT: global_store_short v[4:5], v1, off offset:4
; GFX9-NEXT: global_store_dword v[4:5], v0, off
; GFX9-NEXT: v_pk_max_i16 v2, v7, v2
; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
@ -441,16 +442,16 @@ define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_add_u32_e32 v4, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_max_u16_e32 v2, v0, v1
; VI-NEXT: v_max_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: flat_store_dword v[4:5], v0
; VI-NEXT: v_max_u16_e32 v3, v5, v2
; VI-NEXT: v_max_u16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_umax_ugt_v2i16:


@ -108,46 +108,56 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar
; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2
; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2
; GCN-NEXT: v_add_u32_e32 v0, v0, v2
; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:8
; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:12
; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:16
; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:20
; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:24
; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:28
; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:32
; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:36
; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:40
; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:44
; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:48
; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:52
; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56
; GCN-NEXT: v_add_u32_e32 v1, v1, v2
; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:20
; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:24
; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:28
; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:32
; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:36
; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:40
; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:44
; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:48
; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:52
; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:56
; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:60
; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4
; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8
; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:12
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60
; GCN-NEXT: s_nop 0
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16
; GCN-NEXT: s_nop 0
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:4
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:12
; GCN-NEXT: s_waitcnt vmcnt(4)
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16
; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:20
; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:24
; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:28
; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:32
; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:36
; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:40
; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:44
; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:48
; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:52
; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:56
; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:60
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:4
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:16
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:20
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:24
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:28
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:32
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:36
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:40
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], 0 offen offset:44
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:48
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:52
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], 0 offen offset:56
; GCN-NEXT: s_waitcnt vmcnt(15)
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:60
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
bb:


@ -57,7 +57,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_and_b32_e32 v5, 1, v18
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1
; GFX9-NEXT: s_and_saveexec_b64 s[10:11], s[4:5]
; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
; GFX9-NEXT: s_cbranch_execz BB1_3
; GFX9-NEXT: ; %bb.1: ; %bb19
; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6
@ -67,7 +67,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 2, v2
; GFX9-NEXT: v_add_u32_e32 v7, v17, v12
; GFX9-NEXT: s_mov_b64 s[12:13], 0
; GFX9-NEXT: s_mov_b64 s[10:11], 0
; GFX9-NEXT: BB1_2: ; %bb23
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: v_cvt_f32_u32_e32 v8, v0
@ -76,32 +76,32 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, float addrspace(3
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
; GFX9-NEXT: v_madak_f32 v8, v8, v4, 0x3727c5ac
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX9-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v1
; GFX9-NEXT: v_mul_u32_u24_e32 v18, v8, v5
; GFX9-NEXT: v_add_u32_e32 v8, v8, v16
; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v8, v13
; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v8, v13
; GFX9-NEXT: v_mul_lo_u32 v8, v8, v15
; GFX9-NEXT: v_sub_u32_e32 v19, v9, v18
; GFX9-NEXT: v_cmp_lt_u32_e64 s[8:9], v19, v14
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], s[8:9]
; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v19, v14
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], vcc
; GFX9-NEXT: v_add_u32_e32 v8, v12, v8
; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX9-NEXT: v_mov_b32_e32 v9, 0
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; GFX9-NEXT: v_lshlrev_b64 v[8:9], 2, v[8:9]
; GFX9-NEXT: s_or_b64 s[12:13], s[4:5], s[12:13]
; GFX9-NEXT: v_add_co_u32_e64 v8, s[4:5], v10, v8
; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], v11, v9, s[4:5]
; GFX9-NEXT: v_add_co_u32_e64 v8, s[6:7], v10, v8
; GFX9-NEXT: v_addc_co_u32_e64 v9, s[6:7], v11, v9, s[6:7]
; GFX9-NEXT: global_load_dword v8, v[8:9], off
; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5]
; GFX9-NEXT: ds_write_b32 v3, v8
; GFX9-NEXT: v_add_u32_e32 v3, v3, v6
; GFX9-NEXT: s_andn2_b64 exec, exec, s[12:13]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_cbranch_execnz BB1_2
; GFX9-NEXT: BB1_3: ; %Flow3
; GFX9-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX9-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
bb:


@ -98,17 +98,17 @@ define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
%conv = and i64 %call, 255


@ -166,20 +166,18 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
@ -187,19 +185,19 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v6, s6
; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: flat_load_dword v4, v[4:5]
; VI-NEXT: flat_load_dword v5, v[6:7]
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@ -218,12 +216,12 @@ define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: global_load_dword v4, v[4:5], off
; GFX9-NEXT: global_load_dword v5, v[6:7], off
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v5, v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@ -335,20 +333,18 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc
@ -357,57 +353,57 @@ define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0
; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_saddo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v6, s6
; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6
; VI-NEXT: v_addc_u32_e32 v9, vcc, v5, v7, vcc
; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc
; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s2
; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v4, v6
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v5, v7, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[4:5]
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_byte v[2:3], v0, off
; GFX9-NEXT: global_store_byte v[6:7], v0, off
; GFX9-NEXT: s_endpgm
%a = load i64, i64 addrspace(1)* %aptr, align 4
%b = load i64, i64 addrspace(1)* %bptr, align 4
@ -428,20 +424,18 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s2, s10
; SI-NEXT: s_mov_b32 s3, s11
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3
; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2
@ -461,58 +455,58 @@ define amdgpu_kernel void @v_saddo_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v6, s6
; VI-NEXT: v_mov_b32_e32 v7, s7
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: flat_load_dwordx2 v[6:7], v[6:7]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v9, vcc, v5, v7
; VI-NEXT: v_add_u32_e32 v8, vcc, v4, v6
; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v7
; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v5
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v4
; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3
; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2
; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; VI-NEXT: flat_store_dwordx2 v[0:1], v[8:9]
; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9]
; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_saddo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NEXT: v_mov_b32_e32 v5, s5
; GFX9-NEXT: v_mov_b32_e32 v6, s6
; GFX9-NEXT: v_mov_b32_e32 v7, s7
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[4:5], off
; GFX9-NEXT: global_load_dwordx2 v[6:7], v[6:7], off
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s2
; GFX9-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s6
; GFX9-NEXT: v_mov_b32_e32 v3, s7
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: v_mov_b32_e32 v4, s0
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_mov_b32_e32 v6, s2
; GFX9-NEXT: v_mov_b32_e32 v7, s3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_add_u32_e32 v9, v5, v7
; GFX9-NEXT: v_add_u32_e32 v8, v4, v6
; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v7
; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v5
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v6
; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v4
; GFX9-NEXT: v_add_u32_e32 v9, v1, v3
; GFX9-NEXT: v_add_u32_e32 v8, v0, v2
; GFX9-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3
; GFX9-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1
; GFX9-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v[0:1], v[8:9], off
; GFX9-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GFX9-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[2:3]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[8:9], off
; GFX9-NEXT: global_store_dwordx2 v[6:7], v[0:1], off
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, <2 x i32> addrspace(1)* %aptr, align 4
%b = load <2 x i32>, <2 x i32> addrspace(1)* %bptr, align 4


@ -173,9 +173,9 @@ entry:
; GCN-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
; CI-NOHSA-NOT: v_add
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; CI-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@ -205,8 +205,8 @@ entry:
; SI: s_mov_b32 {{s[0-9]+}}, 0x13480
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], {{s[0-9]+}} addr64
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
; CI-NOHSA-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; CI-NOHSA-DAG: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}


@ -36,10 +36,10 @@ body: |
; CHECK: INLINEASM &"", 1 /* sideeffect attdialect */, 851978 /* regdef:VGPR_LO16 */, def dead [[COPY1]], 851978 /* regdef:VGPR_LO16 */, def dead [[COPY]].sub1, 2147483657 /* reguse tiedto:$0 */, [[COPY1]], 2147549193 /* reguse tiedto:$1 */, [[COPY]].sub1
; CHECK: %11.sub0:vreg_512 = COPY [[COPY]].sub0
; CHECK: %11.sub3:vreg_512 = COPY [[COPY]].sub3
; CHECK: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec
; CHECK: %11.sub2:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
; CHECK: %11.sub5:vreg_512 = COPY undef [[V_MOV_B32_e32_]]
; CHECK: [[COPY2:%[0-9]+]]:vreg_512 = COPY %11
; CHECK: dead %10:vgpr_32 = V_ADD_CO_U32_e32 4, [[V_MOV_B32_e32_1]], implicit-def dead $vcc, implicit $exec
; CHECK: S_BRANCH %bb.1
bb.0:
liveins: $sgpr6_sgpr7


@ -25,6 +25,10 @@ body: |
; CHECK: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 [[DEF]], 0, 0, 0, 0, implicit $exec
; CHECK: [[GLOBAL_LOAD_DWORD:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[DEF]], 8, 0, 0, 0, implicit $exec
; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
; CHECK: undef %4.sub0:vreg_64 = V_MOV_B32_e32 111, implicit $exec
; CHECK: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
@ -32,11 +36,7 @@ body: |
; CHECK: undef %11.sub1:vreg_64 = IMPLICIT_DEF
; CHECK: [[DEF4:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[DEF5:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK: [[COPY1:%[0-9]+]]:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_]]
; CHECK: undef %6.sub0:vreg_64 = V_ADD_F32_e32 [[DEF]].sub0, [[COPY1]].sub0, implicit $mode, implicit $exec
; CHECK: dead undef %6.sub1:vreg_64 = V_ADD_F32_e32 [[DEF]].sub1, [[COPY1]].sub0, implicit $mode, implicit $exec
; CHECK: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: [[GLOBAL_LOAD_DWORD1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_DWORD [[COPY1]], 0, 0, 0, 0, implicit $exec
; CHECK: undef %19.sub0:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD1]], [[GLOBAL_LOAD_DWORDX2_]].sub0, implicit $mode, implicit $exec
; CHECK: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
; CHECK: %19.sub1:vreg_64 = V_ADD_F32_e32 [[GLOBAL_LOAD_DWORD]], [[GLOBAL_LOAD_DWORD]], implicit $mode, implicit $exec


@ -203,14 +203,14 @@ define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: v_lshrrev_b32_e32 v1, 30, v1
@ -224,14 +224,14 @@ define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dword v0, off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: buffer_load_dword v0, off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v1, 30, v1
@ -694,14 +694,14 @@ define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32>
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@ -719,14 +719,14 @@ define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32>
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@ -744,14 +744,14 @@ define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32>
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v1
@ -1073,16 +1073,16 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s11, 0xf000
; GFX9-NEXT: s_mov_b32 s10, -1
; GFX9-NEXT: s_mov_b32 s4, 0x4f7ffffe
; GFX9-NEXT: s_mov_b32 s6, s10
; GFX9-NEXT: s_mov_b32 s7, s11
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s4, s2
; GFX9-NEXT: s_mov_b32 s5, s3
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GFX9-NEXT: s_mov_b32 s2, 0x4f7ffffe
; GFX9-NEXT: s_mov_b32 s8, s0
; GFX9-NEXT: s_mov_b32 s9, s1
; GFX9-NEXT: s_mov_b32 s0, s2
; GFX9-NEXT: s_mov_b32 s1, s3
; GFX9-NEXT: s_mov_b32 s2, s10
; GFX9-NEXT: s_mov_b32 s3, s11
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v0
; GFX9-NEXT: s_waitcnt vmcnt(0)
@ -1120,14 +1120,14 @@ define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ad
; GFX9-NEXT: v_cvt_f32_u32_e32 v14, v7
; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v10
; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v12
; GFX9-NEXT: v_mul_f32_e32 v8, s4, v8
; GFX9-NEXT: v_mul_f32_e32 v8, s2, v8
; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v14
; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v8
; GFX9-NEXT: v_mul_f32_e32 v10, s4, v10
; GFX9-NEXT: v_mul_f32_e32 v12, s4, v12
; GFX9-NEXT: v_mul_f32_e32 v10, s2, v10
; GFX9-NEXT: v_mul_f32_e32 v12, s2, v12
; GFX9-NEXT: v_cvt_u32_f32_e32 v10, v10
; GFX9-NEXT: v_sub_u32_e32 v9, 0, v4
; GFX9-NEXT: v_mul_f32_e32 v14, s4, v14
; GFX9-NEXT: v_mul_f32_e32 v14, s2, v14
; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12
; GFX9-NEXT: v_mul_lo_u32 v9, v9, v8
; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14
@ -1330,14 +1330,14 @@ define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32>
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v5, 31, v1
@ -1363,14 +1363,14 @@ define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32>
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v5, 31, v1
@ -1396,14 +1396,14 @@ define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32>
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_ashrrev_i32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v1
@ -1619,17 +1619,17 @@ define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)*
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; GCN-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2
; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
; GCN-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6
; GCN-NEXT: s_waitcnt vmcnt(2)
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
@ -1660,17 +1660,17 @@ define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)*
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; TONGA-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; TONGA-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; TONGA-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2
; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
; TONGA-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6
; TONGA-NEXT: s_waitcnt vmcnt(2)
; TONGA-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; TONGA-NEXT: v_or_b32_e32 v0, v0, v1
@ -1701,17 +1701,17 @@ define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)*
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_mov_b32 s10, s2
; GFX9-NEXT: s_mov_b32 s11, s3
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s8, s6
; GFX9-NEXT: s_mov_b32 s9, s7
; GFX9-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GFX9-NEXT: buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; GFX9-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; GFX9-NEXT: buffer_load_ubyte v3, off, s[8:11], 0 offset:6
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GFX9-NEXT: buffer_load_ubyte v1, off, s[4:7], 0 offset:2
; GFX9-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
; GFX9-NEXT: buffer_load_ubyte v3, off, s[4:7], 0 offset:6
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-NEXT: v_or_b32_e32 v0, v0, v1
@ -1802,17 +1802,17 @@ define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2
; GCN-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; GCN-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2
; GCN-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
; GCN-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
@ -1840,17 +1840,17 @@ define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)*
; TONGA-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; TONGA-NEXT: s_mov_b32 s3, 0xf000
; TONGA-NEXT: s_mov_b32 s2, -1
; TONGA-NEXT: s_mov_b32 s10, s2
; TONGA-NEXT: s_mov_b32 s11, s3
; TONGA-NEXT: s_waitcnt lgkmcnt(0)
; TONGA-NEXT: s_mov_b32 s8, s6
; TONGA-NEXT: s_mov_b32 s9, s7
; TONGA-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; TONGA-NEXT: buffer_load_sbyte v1, off, s[8:11], 0 offset:2
; TONGA-NEXT: buffer_load_ushort v2, off, s[8:11], 0 offset:4
; TONGA-NEXT: buffer_load_sbyte v3, off, s[8:11], 0 offset:6
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; TONGA-NEXT: buffer_load_sbyte v1, off, s[4:7], 0 offset:2
; TONGA-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:4
; TONGA-NEXT: buffer_load_sbyte v3, off, s[4:7], 0 offset:6
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; TONGA-NEXT: v_or_b32_e32 v2, v2, v4
@ -2214,16 +2214,14 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: s_mov_b32 s0, 0x1389c755
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: s_mov_b32 s4, 0x1389c755
; GCN-NEXT: s_mov_b32 s0, s6
; GCN-NEXT: s_mov_b32 s1, s7
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mul_hi_i32 v0, v0, s0
; GCN-NEXT: v_mul_hi_i32 v1, v1, s0
; GCN-NEXT: v_mul_hi_i32 v2, v2, s0
; GCN-NEXT: v_mul_hi_i32 v3, v3, s0
; GCN-NEXT: v_mul_hi_i32 v0, v0, s4
; GCN-NEXT: v_mul_hi_i32 v1, v1, s4
; GCN-NEXT: v_mul_hi_i32 v2, v2, s4
; GCN-NEXT: v_mul_hi_i32 v3, v3, s4
; GCN-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GCN-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GCN-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@ -2236,7 +2234,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v5
; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v7
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NEXT: s_endpgm
;
; TONGA-LABEL: scalarize_mulhs_4xi32:
@ -2248,16 +2246,14 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
; TONGA-NEXT: s_mov_b32 s0, s4
; TONGA-NEXT: s_mov_b32 s1, s5
; TONGA-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_mov_b32 s0, 0x1389c755
; TONGA-NEXT: s_mov_b32 s4, s6
; TONGA-NEXT: s_mov_b32 s5, s7
; TONGA-NEXT: s_mov_b32 s6, s2
; TONGA-NEXT: s_mov_b32 s7, s3
; TONGA-NEXT: s_mov_b32 s4, 0x1389c755
; TONGA-NEXT: s_mov_b32 s0, s6
; TONGA-NEXT: s_mov_b32 s1, s7
; TONGA-NEXT: s_waitcnt vmcnt(0)
; TONGA-NEXT: v_mul_hi_i32 v0, v0, s0
; TONGA-NEXT: v_mul_hi_i32 v1, v1, s0
; TONGA-NEXT: v_mul_hi_i32 v2, v2, s0
; TONGA-NEXT: v_mul_hi_i32 v3, v3, s0
; TONGA-NEXT: v_mul_hi_i32 v0, v0, s4
; TONGA-NEXT: v_mul_hi_i32 v1, v1, s4
; TONGA-NEXT: v_mul_hi_i32 v2, v2, s4
; TONGA-NEXT: v_mul_hi_i32 v3, v3, s4
; TONGA-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; TONGA-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; TONGA-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@ -2270,7 +2266,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
; TONGA-NEXT: v_add_u32_e32 v1, vcc, v1, v5
; TONGA-NEXT: v_add_u32_e32 v2, vcc, v2, v6
; TONGA-NEXT: v_add_u32_e32 v3, vcc, v3, v7
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; TONGA-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; TONGA-NEXT: s_endpgm
;
; GFX9-LABEL: scalarize_mulhs_4xi32:
@ -2282,16 +2278,14 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_mov_b32 s0, 0x1389c755
; GFX9-NEXT: s_mov_b32 s4, s6
; GFX9-NEXT: s_mov_b32 s5, s7
; GFX9-NEXT: s_mov_b32 s6, s2
; GFX9-NEXT: s_mov_b32 s7, s3
; GFX9-NEXT: s_mov_b32 s4, 0x1389c755
; GFX9-NEXT: s_mov_b32 s0, s6
; GFX9-NEXT: s_mov_b32 s1, s7
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mul_hi_i32 v0, v0, s0
; GFX9-NEXT: v_mul_hi_i32 v1, v1, s0
; GFX9-NEXT: v_mul_hi_i32 v2, v2, s0
; GFX9-NEXT: v_mul_hi_i32 v3, v3, s0
; GFX9-NEXT: v_mul_hi_i32 v0, v0, s4
; GFX9-NEXT: v_mul_hi_i32 v1, v1, s4
; GFX9-NEXT: v_mul_hi_i32 v2, v2, s4
; GFX9-NEXT: v_mul_hi_i32 v3, v3, s4
; GFX9-NEXT: v_lshrrev_b32_e32 v4, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v5, 31, v1
@ -2304,7 +2298,7 @@ define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocaptu
; GFX9-NEXT: v_add_u32_e32 v1, v1, v5
; GFX9-NEXT: v_add_u32_e32 v2, v2, v6
; GFX9-NEXT: v_add_u32_e32 v3, v3, v7
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
; EG-LABEL: scalarize_mulhs_4xi32:


@ -1867,56 +1867,56 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
define amdgpu_kernel void @s_test_sdiv24_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
; GCN-LABEL: s_test_sdiv24_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], 40
; GCN-NEXT: v_cvt_f32_i32_e32 v0, s6
; GCN-NEXT: s_mov_b32 s7, 0x41c00000
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_ashr_i32 s4, s6, 30
; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; GCN-NEXT: v_cvt_f32_i32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s3, 0x41c00000
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_ashr_i32 s0, s2, 30
; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_or_b32 s6, s4, 1
; GCN-NEXT: v_mul_f32_e32 v1, s7, v1
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: s_or_b32 s2, s0, 1
; GCN-NEXT: v_mul_f32_e32 v1, s3, v1
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_mad_f32 v2, -v1, v0, s7
; GCN-NEXT: v_mad_f32 v2, -v1, v0, s3
; GCN-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
; GCN-NEXT: s_cmp_lg_u32 s4, 0
; GCN-NEXT: s_cselect_b32 s4, s6, 0
; GCN-NEXT: v_add_i32_e32 v0, vcc, s4, v1
; GCN-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GCN-NEXT: s_cmp_lg_u32 s0, 0
; GCN-NEXT: s_cselect_b32 s0, s2, 0
; GCN-NEXT: v_add_i32_e32 v0, vcc, s0, v1
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24
; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_sdiv24_k_num_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_ashr_i64 s[6:7], s[6:7], 40
; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s6
; GCN-IR-NEXT: s_mov_b32 s7, 0x41c00000
; GCN-IR-NEXT: s_mov_b32 s0, s4
; GCN-IR-NEXT: s_ashr_i32 s4, s6, 30
; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 40
; GCN-IR-NEXT: v_cvt_f32_i32_e32 v0, s2
; GCN-IR-NEXT: s_mov_b32 s3, 0x41c00000
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: s_ashr_i32 s0, s2, 30
; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0
; GCN-IR-NEXT: s_mov_b32 s1, s5
; GCN-IR-NEXT: s_or_b32 s6, s4, 1
; GCN-IR-NEXT: v_mul_f32_e32 v1, s7, v1
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: s_or_b32 s2, s0, 1
; GCN-IR-NEXT: v_mul_f32_e32 v1, s3, v1
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s7
; GCN-IR-NEXT: v_mad_f32 v2, -v1, v0, s3
; GCN-IR-NEXT: v_cvt_i32_f32_e32 v1, v1
; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
; GCN-IR-NEXT: s_cmp_lg_u32 s4, 0
; GCN-IR-NEXT: s_cselect_b32 s4, s6, 0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s4, v1
; GCN-IR-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, |v0|
; GCN-IR-NEXT: s_cmp_lg_u32 s0, 0
; GCN-IR-NEXT: s_cselect_b32 s0, s2, 0
; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v1
; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24
; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%x.shr = ashr i64 %x, 40
%result = sdiv i64 24, %x.shr


@ -73,7 +73,7 @@ entry:
; GCN-LABEL: {{^}}mul_v2i16:
; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST0]], v[[DST1]]
; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[DST_SHL]]
; NOSDWA-NOT: v_mul_u32_u24_sdwa


@ -52,25 +52,25 @@ define amdgpu_kernel void @select_f16(
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_mov_b32 s16, s8
; VI-NEXT: s_mov_b32 s17, s9
; VI-NEXT: s_mov_b32 s16, s6
; VI-NEXT: s_mov_b32 s17, s7
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s20, s8
; VI-NEXT: s_mov_b32 s21, s9
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s22, s2
; VI-NEXT: s_mov_b32 s23, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v1, off, s[20:23], 0
; VI-NEXT: buffer_load_ushort v2, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: s_waitcnt vmcnt(0)
@ -137,21 +137,21 @@ define amdgpu_kernel void @select_f16_imm_a(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
@ -216,21 +216,21 @@ define amdgpu_kernel void @select_f16_imm_b(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
; VI-NEXT: s_waitcnt vmcnt(0)
@ -295,26 +295,26 @@ define amdgpu_kernel void @select_f16_imm_c(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x3800
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
half addrspace(1)* %r,
@ -375,26 +375,26 @@ define amdgpu_kernel void @select_f16_imm_d(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; VI-NEXT: buffer_load_ushort v1, off, s[12:15], 0
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_ushort v3, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x3800
; VI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; VI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: buffer_store_short v0, off, s[8:11], 0
; VI-NEXT: s_endpgm
half addrspace(1)* %r,
@ -474,25 +474,25 @@ define amdgpu_kernel void @select_v2f16(
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s18, s2
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_mov_b32 s16, s8
; VI-NEXT: s_mov_b32 s17, s9
; VI-NEXT: s_mov_b32 s16, s6
; VI-NEXT: s_mov_b32 s17, s7
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s20, s8
; VI-NEXT: s_mov_b32 s21, s9
; VI-NEXT: s_mov_b32 s8, s10
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s19, s3
; VI-NEXT: s_mov_b32 s22, s2
; VI-NEXT: s_mov_b32 s23, s3
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v0, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v1, off, s[20:23], 0
; VI-NEXT: buffer_load_dword v2, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v3, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt vmcnt(3)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT: s_waitcnt vmcnt(2)
@ -534,15 +534,15 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
@ -580,22 +580,22 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_movk_i32 s0, 0x3900
; VI-NEXT: s_movk_i32 s2, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_lt_f16_e32 vcc, 0.5, v0
@ -603,7 +603,7 @@ define amdgpu_kernel void @select_v2f16_imm_a(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: v_cmp_lt_f16_e32 vcc, s0, v3
; VI-NEXT: v_cmp_lt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -633,15 +633,15 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
@ -679,22 +679,22 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: s_movk_i32 s0, 0x3900
; VI-NEXT: s_movk_i32 s2, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; VI-NEXT: v_cmp_gt_f16_e32 vcc, 0.5, v0
@ -702,7 +702,7 @@ define amdgpu_kernel void @select_v2f16_imm_b(
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; VI-NEXT: v_cmp_gt_f16_e32 vcc, s0, v3
; VI-NEXT: v_cmp_gt_f16_e32 vcc, s2, v3
; VI-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@ -731,38 +731,39 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0
; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_cvt_f32_f16_e32 v4, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v5
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v6, vcc
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v3
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v4, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v1, vcc
; SI-NEXT: v_cndmask_b32_e32 v1, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
@ -777,32 +778,33 @@ define amdgpu_kernel void @select_v2f16_imm_c(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x3800
; VI-NEXT: v_mov_b32_e32 v3, 0x3900
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v1
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v0, v4
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_nlt_f16_e32 vcc, v6, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0
@ -830,41 +832,41 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s16, s4
; SI-NEXT: s_mov_b32 s17, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: buffer_load_dword v3, off, s[16:19], 0
; SI-NEXT: v_mov_b32_e32 v2, 0x3f200000
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x3f200000
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_waitcnt vmcnt(2)
; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v0
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v1
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v3
; SI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v6, v6
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v4, v5
; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v1, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; SI-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, 0.5, v2, vcc
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v1
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_endpgm
@ -877,32 +879,33 @@ define amdgpu_kernel void @select_v2f16_imm_d(
; VI-NEXT: s_mov_b32 s14, s10
; VI-NEXT: s_mov_b32 s15, s11
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_mov_b32 s0, s2
; VI-NEXT: s_mov_b32 s1, s3
; VI-NEXT: s_mov_b32 s12, s4
; VI-NEXT: s_mov_b32 s13, s5
; VI-NEXT: s_mov_b32 s2, s10
; VI-NEXT: s_mov_b32 s3, s11
; VI-NEXT: s_mov_b32 s12, s2
; VI-NEXT: s_mov_b32 s13, s3
; VI-NEXT: s_mov_b32 s16, s4
; VI-NEXT: s_mov_b32 s17, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s18, s10
; VI-NEXT: s_mov_b32 s19, s11
; VI-NEXT: s_mov_b32 s6, s10
; VI-NEXT: s_mov_b32 s7, s11
; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; VI-NEXT: buffer_load_dword v4, off, s[12:15], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x3800
; VI-NEXT: v_mov_b32_e32 v3, 0x3900
; VI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; VI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; VI-NEXT: buffer_load_dword v2, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x3800
; VI-NEXT: v_mov_b32_e32 v4, 0x3900
; VI-NEXT: s_mov_b32 s8, s0
; VI-NEXT: s_mov_b32 s9, s1
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v0, v4
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; VI-NEXT: v_cmp_lt_f16_e32 vcc, v6, v5
; VI-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; VI-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0


@ -54,8 +54,8 @@ define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)*
; after 64-bit shift is split.
; GCN-LABEL: {{^}}lshr_and_i64_35:
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_load_dword v[[LO:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {


@ -13,14 +13,14 @@ define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> add
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT: v_lshl_b32_e32 v0, v0, v2
@ -59,15 +59,15 @@ define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> add
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshl_b32_e32 v3, v3, v7
; GCN-NEXT: v_lshl_b32_e32 v2, v2, v6
@ -411,23 +411,23 @@ define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> add
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: s_mov_b32 s14, 0
; GCN-NEXT: s_mov_b32 s15, s3
; GCN-NEXT: s_mov_b64 s[12:13], s[6:7]
; GCN-NEXT: buffer_load_dword v2, off, s[8:11], 0
; GCN-NEXT: buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
; GCN-NEXT: s_mov_b32 s6, 0xffff
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s4, 0xffff
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_and_b32_e32 v0, s6, v0
; GCN-NEXT: v_lshl_b32_e32 v0, v2, v0
; GCN-NEXT: v_lshl_b32_e32 v1, v1, v3
; GCN-NEXT: v_and_b32_e32 v0, s4, v0
; GCN-NEXT: v_and_b32_e32 v0, s6, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0
@ -490,14 +490,14 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; GCN-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; GCN-NEXT: s_mov_b32 s8, 0xffff
; GCN-NEXT: s_mov_b32 s0, 0xffff
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_and_b32_e32 v8, s8, v4
; GCN-NEXT: v_and_b32_e32 v8, s0, v4
; GCN-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_and_b32_e32 v9, s8, v5
; GCN-NEXT: v_and_b32_e32 v9, s0, v5
; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_lshl_b32_e32 v5, v7, v5
@ -505,9 +505,9 @@ define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> add
; GCN-NEXT: v_lshl_b32_e32 v4, v6, v4
; GCN-NEXT: v_lshl_b32_e32 v2, v2, v8
; GCN-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GCN-NEXT: v_and_b32_e32 v3, s8, v3
; GCN-NEXT: v_and_b32_e32 v3, s0, v3
; GCN-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GCN-NEXT: v_and_b32_e32 v2, s8, v2
; GCN-NEXT: v_and_b32_e32 v2, s0, v2
; GCN-NEXT: v_or_b32_e32 v3, v3, v5
; GCN-NEXT: v_or_b32_e32 v2, v2, v4
; GCN-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
@ -732,17 +732,17 @@ define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> add
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_mov_b32 s10, s2
; GCN-NEXT: s_mov_b32 s11, s3
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s8, s6
; GCN-NEXT: s_mov_b32 s9, s7
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:48
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s7
; GCN-NEXT: s_mov_b32 s6, s2
; GCN-NEXT: s_mov_b32 s7, s3
; GCN-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCN-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; GCN-NEXT: buffer_load_dwordx4 v[11:14], off, s[4:7], 0 offset:48
; GCN-NEXT: s_waitcnt vmcnt(1)
; GCN-NEXT: v_lshl_b64 v[2:3], v[2:3], v10
; GCN-NEXT: s_waitcnt vmcnt(0)


@ -86,23 +86,23 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
; VI-LABEL: v_shl_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v5, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 4, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[4:5]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v4, v1, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v4, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v5
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v2i16:
@ -116,17 +116,17 @@ define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> a
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dword v3, v[0:1], s[0:3], 0 addr64 offset:4
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v5, s8, v3
; CI-NEXT: v_and_b32_e32 v5, s0, v3
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshl_b32_e32 v3, v4, v3
; CI-NEXT: v_lshl_b32_e32 v2, v2, v5
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; CI-NEXT: s_endpgm
@ -170,39 +170,39 @@ define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, s0, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshlrev_b16_e32 v4, s0, v3
; VI-NEXT: v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_s_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dword s0, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_load_dword s8, s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, 0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s9, s0, 16
; CI-NEXT: s_and_b32 s10, s0, s8
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_and_b32 s8, s8, s0
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, s10, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, s9, v3
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v2, s8, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, s1, v3
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v2, v2, v3
; CI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
@ -245,17 +245,17 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_mov_b32_e32 v2, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v1, v0, s0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshlrev_b16_e64 v4, v3, s0
; VI-NEXT: v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v4, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_s_v_v2i16:
@ -270,12 +270,12 @@ define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16>
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: buffer_load_dword v2, v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_lshr_b32 s9, s8, 16
; CI-NEXT: s_lshr_b32 s1, s8, 16
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v3, s0, v2
; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; CI-NEXT: v_lshl_b32_e32 v2, s9, v2
; CI-NEXT: v_lshl_b32_e32 v2, s1, v2
; CI-NEXT: v_lshl_b32_e32 v3, s8, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_and_b32_e32 v3, s0, v3
@ -319,15 +319,15 @@ define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e64 v1, v0, 8
; VI-NEXT: v_lshlrev_b16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshlrev_b16_e64 v2, v3, 8
; VI-NEXT: v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_imm_v_v2i16:
@ -387,16 +387,16 @@ define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; VI-NEXT: v_and_b32_e32 v1, 0xff000000, v1
; VI-NEXT: v_lshlrev_b16_e32 v0, 8, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT: v_and_b32_e32 v2, 0xff000000, v2
; VI-NEXT: v_lshlrev_b16_e32 v3, 8, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; CI-LABEL: shl_v_imm_v2i16:
@ -429,45 +429,45 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
; GFX9-LABEL: v_shl_v4i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v5, s1
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v5
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4
; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3
; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2
; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_shl_v4i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v0
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v6, v5, v1
; VI-NEXT: v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v5, v4, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v6, v3, v1
; VI-NEXT: v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_lshlrev_b16_e32 v3, v2, v0
; VI-NEXT: v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v1, v6, v1
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: v_or_b32_e32 v0, v3, v0
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: s_endpgm
;
; CI-LABEL: v_shl_v4i16:
@ -481,14 +481,14 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: buffer_load_dwordx2 v[4:5], v[0:1], s[0:3], 0 addr64 offset:8
; CI-NEXT: s_mov_b32 s8, 0xffff
; CI-NEXT: s_mov_b32 s0, 0xffff
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(1)
; CI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_and_b32_e32 v8, s8, v4
; CI-NEXT: v_and_b32_e32 v8, s0, v4
; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; CI-NEXT: v_and_b32_e32 v9, s8, v5
; CI-NEXT: v_and_b32_e32 v9, s0, v5
; CI-NEXT: v_lshrrev_b32_e32 v7, 16, v3
; CI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; CI-NEXT: v_lshl_b32_e32 v5, v7, v5
@ -496,9 +496,9 @@ define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> a
; CI-NEXT: v_lshl_b32_e32 v4, v6, v4
; CI-NEXT: v_lshl_b32_e32 v2, v2, v8
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_and_b32_e32 v2, s8, v2
; CI-NEXT: v_and_b32_e32 v2, s0, v2
; CI-NEXT: v_or_b32_e32 v3, v3, v5
; CI-NEXT: v_or_b32_e32 v2, v2, v4
; CI-NEXT: buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
@ -539,21 +539,21 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_mov_b32 s0, 0xff000000
; VI-NEXT: s_mov_b32 s2, 0xff000000
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v4, 8, v1
; VI-NEXT: v_lshlrev_b16_e32 v5, 8, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; VI-NEXT: v_and_b32_e32 v0, s0, v0
; VI-NEXT: v_and_b32_e32 v0, s2, v0
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v1
; VI-NEXT: v_and_b32_e32 v4, s0, v4
; VI-NEXT: v_and_b32_e32 v4, s2, v4
; VI-NEXT: v_or_b32_e32 v1, v1, v4
; VI-NEXT: v_or_b32_e32 v0, v5, v0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@ -569,14 +569,14 @@ define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
; CI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
; CI-NEXT: s_mov_b32 s8, 0xff00
; CI-NEXT: s_mov_b32 s0, 0xff00
; CI-NEXT: s_mov_b64 s[6:7], s[2:3]
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshrrev_b32_e32 v4, 8, v3
; CI-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; CI-NEXT: v_and_b32_e32 v4, s8, v4
; CI-NEXT: v_and_b32_e32 v4, s0, v4
; CI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
; CI-NEXT: v_and_b32_e32 v3, s8, v3
; CI-NEXT: v_and_b32_e32 v3, s0, v3
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v3, v3, v4
; CI-NEXT: v_and_b32_e32 v2, 0xff00ff00, v2


@ -112,17 +112,17 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_subrev_u32_e32 v1, vcc, 64, v4
; VI-NEXT: v_subrev_u32_e32 v2, vcc, 64, v3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_subrev_u32_e32 v0, vcc, 64, v0
; VI-NEXT: flat_store_dword v[2:3], v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_subrev_u32_e32 v3, vcc, 64, v4
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i32_x_sub_64_multi_use:
@ -133,17 +133,17 @@ define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out,
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v4
; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0
; GFX9-NEXT: global_store_dword v[2:3], v1, off
; GFX9-NEXT: global_store_dword v[2:3], v0, off
; GFX9-NEXT: v_subrev_u32_e32 v3, 64, v4
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: global_store_dword v[0:1], v3, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i32_x_sub_64_multi_use:
@ -945,17 +945,17 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ushort v3, v[0:1]
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1)
; VI-NEXT: v_subrev_u16_e32 v1, 64, v4
; VI-NEXT: v_subrev_u16_e32 v2, 64, v3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
; VI-NEXT: flat_store_short v[2:3], v1
; VI-NEXT: flat_store_short v[2:3], v0
; VI-NEXT: v_subrev_u16_e32 v3, 64, v4
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: flat_store_short v[0:1], v3
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_i16_x_sub_64_multi_use:
@ -966,17 +966,17 @@ define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out,
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_load_ushort v3, v[0:1], off
; GFX9-NEXT: global_load_ushort v4, v[0:1], off
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v4
; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0
; GFX9-NEXT: global_store_short v[2:3], v1, off
; GFX9-NEXT: global_store_short v[2:3], v0, off
; GFX9-NEXT: v_subrev_u16_e32 v3, 64, v4
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: global_store_short v[0:1], v3, off
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: v_test_i16_x_sub_64_multi_use:
@ -1037,20 +1037,20 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(<2 x i16> addrspace(1)* %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 64
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 64
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_64:
@ -1125,15 +1125,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(<2 x i16> addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, -7, v0
; VI-NEXT: v_sub_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_add_u16_e32 v2, -7, v3
; VI-NEXT: v_sub_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_64:
@ -1204,20 +1204,20 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(<2 x i16> addrspace(1)* %ou
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 0xffffff85
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 0xffffff85
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_subrev_u16_e32 v0, 64, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_add_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_subrev_u16_e32 v3, 64, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_64_123:
@ -1292,15 +1292,15 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_0(<2 x i16> addrspace(1)* %out,
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_add_u16_e32 v0, -7, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, -7, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_sub_7_0:
@ -1608,20 +1608,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(<2 x i16> addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: v_mov_b32_e32 v4, 32
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, 32
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_sub_u16_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32:
@ -1772,15 +1772,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_0(<2 x i16> addrspace(1)* %o
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_subrev_u16_e32 v3, 32, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg32_0:
@ -1856,15 +1856,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(<2 x i16> addrspace(1)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, -16, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_add_u16_e32 v2, -16, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16:
@ -2015,15 +2015,15 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_0(<2 x i16> addrspace(1)* %o
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_add_u16_e32 v0, -16, v0
; VI-NEXT: v_or_b32_e32 v0, v0, v1
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_and_b32_e32 v2, 0xffff0000, v3
; VI-NEXT: v_add_u16_e32 v3, -16, v3
; VI-NEXT: v_or_b32_e32 v2, v3, v2
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg16_0:
@ -2094,20 +2094,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(<2 x i16> addrspace(1)*
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0xc400
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_movk_i32 s0, 0xc400
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, s0, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone:
@ -2179,20 +2179,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(<2 x i16> addrspace(1
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0x4400
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_movk_i32 s0, 0x4400
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, s0, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone:
@ -2264,20 +2264,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(<2 x i16> addrspace(1)*
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0x4000
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_movk_i32 s0, 0x4000
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, s0, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo:
@ -2349,20 +2349,20 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(<2 x i16> addrspace(1
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_movk_i32 s2, 0xc000
; VI-NEXT: v_mov_b32_e32 v4, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: s_movk_i32 s0, 0xc000
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, s0, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: flat_store_dword v[2:3], v0
; VI-NEXT: v_add_u16_e32 v2, s2, v3
; VI-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v2, v3
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo:


@ -399,14 +399,14 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT: v_bfe_i32 v2, v0, 16, 8
@ -423,14 +423,14 @@ define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addr
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
@ -523,14 +523,14 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 add
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
@ -547,14 +547,14 @@ define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 add
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16


@ -30,26 +30,24 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-LABEL: v_test_sub_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v2, v0, v1
; VI-NEXT: v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@ -88,14 +86,14 @@ define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_load_dword s4, s[6:7], 0x0
; VI-NEXT: s_load_dword s5, s[8:9], 0x0
; VI-NEXT: s_load_dword s6, s[8:9], 0x0
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s4, 16
; VI-NEXT: s_lshr_b32 s7, s5, 16
; VI-NEXT: s_sub_i32 s4, s4, s5
; VI-NEXT: s_sub_i32 s5, s6, s7
; VI-NEXT: s_lshr_b32 s5, s4, 16
; VI-NEXT: s_lshr_b32 s7, s6, 16
; VI-NEXT: s_sub_i32 s4, s4, s6
; VI-NEXT: s_sub_i32 s5, s5, s7
; VI-NEXT: s_and_b32 s4, s4, 0xffff
; VI-NEXT: s_lshl_b32 s5, s5, 16
; VI-NEXT: s_or_b32 s4, s4, s5
@ -185,22 +183,20 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
;
; VI-LABEL: v_test_sub_v2i16_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0xfffffe38
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v1, 0xfffffe38
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 0xffffff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_add_u16_e32 v2, 0xffffff85, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -235,22 +231,20 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
;
; VI-LABEL: v_test_sub_v2i16_neg_constant:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 0x3df
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v1, 0x3df
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_add_u16_e32 v2, 0x34d, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -283,22 +277,20 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
;
; VI-LABEL: v_test_sub_v2i16_inline_neg1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: v_mov_b32_e32 v2, 1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: v_mov_b32_e32 v1, 1
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u16_e32 v1, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_add_u16_e32 v2, 1, v0
; VI-NEXT: v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, v2, v0
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@ -331,17 +323,15 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
;
; VI-LABEL: v_test_sub_v2i16_inline_lo_zero_hi:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; VI-NEXT: v_subrev_u16_e32 v0, 32, v0
@ -411,50 +401,46 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v1, v2
; VI-NEXT: v_sub_u16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
@ -473,54 +459,50 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s8, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[4:5], off
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: v_mov_b32_e32 v3, 0
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_zext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: v_mov_b32_e32 v3, v1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_e32 v0, v4, v2
; VI-NEXT: v_sub_u16_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
@ -539,52 +521,48 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_u16_e32 v0, v0, v1
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_bfe_i32 v1, v2, 0, 16
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
@ -603,21 +581,19 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: v_mov_b32_e32 v3, s9
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: global_load_dword v1, v[2:3], off
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: s_mov_b32 s1, s5
; GFX9-NEXT: s_mov_b32 s7, 0xf000
; GFX9-NEXT: s_mov_b32 s6, -1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1
@ -625,27 +601,25 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GFX9-NEXT: s_endpgm
;
; VI-LABEL: v_test_sub_v2i16_sext_to_v2i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_add_u32_e32 v0, vcc, s6, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s9
; VI-NEXT: v_add_u32_e32 v2, vcc, s8, v2
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dword v0, v[0:1]
; VI-NEXT: flat_load_dword v1, v[2:3]
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: v_sub_u16_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-NEXT: v_sub_u16_e32 v0, v0, v1
@ -653,7 +627,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)
; VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid

@ -106,13 +106,13 @@ define amdgpu_kernel void @truncate_high_elt_extract_vector(<2 x i16> addrspace(
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[4:5], 0x0
; VI-NEXT: s_load_dword s3, s[6:7], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_load_dword s0, s[4:5], 0x0
; VI-NEXT: s_load_dword s1, s[6:7], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s0, s0
; VI-NEXT: s_sext_i32_i16 s1, s1
; VI-NEXT: s_sext_i32_i16 s0, s2
; VI-NEXT: s_sext_i32_i16 s1, s3
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: v_mul_i32_i24_e32 v2, s1, v2
; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2

@ -1824,46 +1824,46 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(i64 addrspace(1)* %out, i64 %
define amdgpu_kernel void @s_test_udiv24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
; GCN-LABEL: s_test_udiv24_k_den_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s6, 0x46b6fe00
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_lshr_b32 s0, s7, 8
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: s_lshr_b32 s2, s3, 8
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: s_mov_b32 s2, 0x46b6fe00
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
; GCN-NEXT: v_mad_f32 v0, -v1, s6, v0
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6
; GCN-NEXT: v_mad_f32 v0, -v1, s2, v0
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_udiv24_k_den_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_mov_b32 s6, 0x46b6fe00
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_lshr_b32 s0, s7, 8
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s0
; GCN-IR-NEXT: s_mov_b32 s0, s4
; GCN-IR-NEXT: s_mov_b32 s1, s5
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-IR-NEXT: s_mov_b32 s2, 0x46b6fe00
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
; GCN-IR-NEXT: v_mad_f32 v0, -v1, s6, v0
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s6
; GCN-IR-NEXT: v_mad_f32 v0, -v1, s2, v0
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s2
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%x.shr = lshr i64 %x, 40
%result = udiv i64 %x.shr, 23423

@ -1479,52 +1479,52 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(i64 addrspace(1)* %out, i64 %
define amdgpu_kernel void @s_test_urem24_k_den_i64(i64 addrspace(1)* %out, i64 %x) {
; GCN-LABEL: s_test_urem24_k_den_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s1, 0x46b6fe00
; GCN-NEXT: s_movk_i32 s0, 0x5b7f
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NEXT: s_mov_b32 s4, 0x46b6fe00
; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s6, s7, 8
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: s_lshr_b32 s2, s3, 8
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: s_movk_i32 s3, 0x5b7f
; GCN-NEXT: s_mov_b32 s5, s1
; GCN-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1
; GCN-NEXT: v_mad_f32 v0, -v1, s1, v0
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1
; GCN-NEXT: s_mov_b32 s1, s5
; GCN-NEXT: v_mad_f32 v0, -v1, s4, v0
; GCN-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-NEXT: v_mul_lo_u32 v0, v0, s0
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: v_mul_lo_u32 v0, v0, s3
; GCN-NEXT: v_mov_b32_e32 v1, 0
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
; GCN-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NEXT: s_endpgm
;
; GCN-IR-LABEL: s_test_urem24_k_den_i64:
; GCN-IR: ; %bb.0:
; GCN-IR-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s1, 0x46b6fe00
; GCN-IR-NEXT: s_movk_i32 s0, 0x5b7f
; GCN-IR-NEXT: s_mov_b32 s3, 0xf000
; GCN-IR-NEXT: s_mov_b32 s2, -1
; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-IR-NEXT: s_mov_b32 s4, 0x46b6fe00
; GCN-IR-NEXT: s_mov_b32 s7, 0xf000
; GCN-IR-NEXT: s_mov_b32 s6, -1
; GCN-IR-NEXT: s_waitcnt lgkmcnt(0)
; GCN-IR-NEXT: s_lshr_b32 s6, s7, 8
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8
; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-IR-NEXT: s_movk_i32 s3, 0x5b7f
; GCN-IR-NEXT: s_mov_b32 s5, s1
; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x38331158, v0
; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1
; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1
; GCN-IR-NEXT: v_mad_f32 v0, -v1, s1, v0
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s1
; GCN-IR-NEXT: s_mov_b32 s1, s5
; GCN-IR-NEXT: v_mad_f32 v0, -v1, s4, v0
; GCN-IR-NEXT: v_cmp_ge_f32_e64 vcc, |v0|, s4
; GCN-IR-NEXT: s_mov_b32 s4, s0
; GCN-IR-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s0
; GCN-IR-NEXT: s_mov_b32 s0, s4
; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, s3
; GCN-IR-NEXT: v_mov_b32_e32 v1, 0
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s6, v0
; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0
; GCN-IR-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-IR-NEXT: s_endpgm
%x.shr = lshr i64 %x, 40
%result = urem i64 %x.shr, 23423

@ -80,12 +80,12 @@ define amdgpu_kernel void @madak_f16_use_2(
; SI-NEXT: s_mov_b32 s9, s11
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
; SI-NEXT: v_mov_b32_e32 v2, 0x41200000
; SI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; SI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
; SI-NEXT: v_mov_b32_e32 v3, 0x41200000
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
@ -95,11 +95,11 @@ define amdgpu_kernel void @madak_f16_use_2(
; SI-NEXT: s_waitcnt vmcnt(1)
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_madak_f32 v1, v0, v1, 0x41200000
; SI-NEXT: v_mac_f32_e32 v2, v0, v3
; SI-NEXT: v_mac_f32_e32 v3, v0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v3
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: buffer_store_short v1, off, s[8:11], 0
; SI-NEXT: s_endpgm
@ -119,24 +119,22 @@ define amdgpu_kernel void @madak_f16_use_2(
; VI-NEXT: s_mov_b32 s9, s11
; VI-NEXT: s_mov_b32 s10, s2
; VI-NEXT: s_mov_b32 s11, s3
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: s_mov_b32 s14, s2
; VI-NEXT: s_mov_b32 s15, s3
; VI-NEXT: buffer_load_ushort v3, off, s[12:15], 0
; VI-NEXT: v_mov_b32_e32 v2, 0x4900
; VI-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; VI-NEXT: buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT: buffer_load_ushort v2, off, s[12:15], 0
; VI-NEXT: v_mov_b32_e32 v3, 0x4900
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_mov_b32 s8, s6
; VI-NEXT: s_mov_b32 s9, s7
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_madak_f16 v1, v0, v1, 0x4900
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mac_f16_e32 v2, v0, v3
; VI-NEXT: v_mac_f16_e32 v3, v0, v2
; VI-NEXT: buffer_store_short v1, off, s[0:3], 0
; VI-NEXT: buffer_store_short v2, off, s[4:7], 0
; VI-NEXT: buffer_store_short v3, off, s[8:11], 0
; VI-NEXT: s_endpgm
half addrspace(1)* %r0,
half addrspace(1)* %r1,

@ -36,33 +36,33 @@ define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
; GCN-NEXT: v_mov_b32_e32 v2, 0
; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: s_mov_b32 s2, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_mov_b64 s[0:1], s[6:7]
; GCN-NEXT: v_lshlrev_b32_e32 v1, 4, v0
; GCN-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; GCN-NEXT: v_mov_b32_e32 v5, v2
; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 addr64
; GCN-NEXT: v_mov_b32_e32 v6, s8
; GCN-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GCN-NEXT: buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64
; GCN-NEXT: v_lshlrev_b32_e32 v6, 2, v0
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 3
; GCN-NEXT: v_mov_b32_e32 v7, v5
; GCN-NEXT: s_mov_b64 s[6:7], s[2:3]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 2
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 1
; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 1
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GCN-NEXT: buffer_store_dword v0, v[4:5], s[4:7], 0 addr64
; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; GCN-NEXT: buffer_store_dword v0, v[6:7], s[4:7], 0 addr64
; GCN-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%id.ext = sext i32 %id to i64

@ -108,12 +108,12 @@ define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
@ -128,11 +128,11 @@ define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half>
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]

@ -6,17 +6,17 @@ define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
; SI-LABEL: widen_i16_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_addk_i32 s0, 0x3e7
; SI-NEXT: s_or_b32 s0, s0, 4
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load:
@ -43,18 +43,18 @@ define amdgpu_kernel void @widen_i16_constant_load_zext_i32(i16 addrspace(4)* %a
; SI-LABEL: widen_i16_constant_load_zext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s0, s0, 0xffff
; SI-NEXT: s_addk_i32 s0, 0x3e7
; SI-NEXT: s_or_b32 s0, s0, 4
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_and_b32 s1, s1, 0xffff
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load_zext_i32:
@ -83,18 +83,18 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %a
; SI-LABEL: widen_i16_constant_load_sext_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sext_i32_i16 s0, s0
; SI-NEXT: s_addk_i32 s0, 0x3e7
; SI-NEXT: s_or_b32 s0, s0, 4
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_sext_i32_i16 s1, s1
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_constant_load_sext_i32:
@ -122,13 +122,13 @@ define amdgpu_kernel void @widen_i16_constant_load_sext_i32(i16 addrspace(4)* %a
define amdgpu_kernel void @widen_i17_constant_load(i17 addrspace(4)* %arg) {
; SI-LABEL: widen_i17_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s7, s[8:9], 0x0
; SI-NEXT: s_load_dword s7, s[6:7], 0x0
; SI-NEXT: s_mov_b32 s4, 2
; SI-NEXT: s_mov_b32 s5, s0
; SI-NEXT: s_mov_b32 s6, s2
@ -206,23 +206,23 @@ define amdgpu_kernel void @widen_f16_constant_load(half addrspace(4)* %arg) {
define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg) {
; SI-LABEL: widen_v2i8_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[2:3], 0x0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s1, s0, 0xff00
; SI-NEXT: s_add_i32 s0, s0, 12
; SI-NEXT: s_or_b32 s0, s0, 4
; SI-NEXT: s_and_b32 s0, s0, 0xff
; SI-NEXT: s_or_b32 s0, s1, s0
; SI-NEXT: s_addk_i32 s0, 0x2c00
; SI-NEXT: s_or_b32 s0, s0, 0x300
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_and_b32 s4, s1, 0xff00
; SI-NEXT: s_add_i32 s1, s1, 12
; SI-NEXT: s_or_b32 s1, s1, 4
; SI-NEXT: s_and_b32 s1, s1, 0xff
; SI-NEXT: s_or_b32 s1, s4, s1
; SI-NEXT: s_addk_i32 s1, 0x2c00
; SI-NEXT: s_or_b32 s4, s1, 0x300
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_v2i8_constant_load:
@ -302,16 +302,16 @@ define amdgpu_kernel void @widen_i1_constant_load(i1 addrspace(4)* %arg) {
; SI-LABEL: widen_i1_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s0, s0, 1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_and_b32 s4, s1, 1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i1_constant_load:
@ -336,18 +336,18 @@ define amdgpu_kernel void @widen_i16_zextload_i64_constant_load(i16 addrspace(4)
; SI-LABEL: widen_i16_zextload_i64_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s0, s0, 0xffff
; SI-NEXT: s_addk_i32 s0, 0x3e7
; SI-NEXT: s_or_b32 s0, s0, 4
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_and_b32 s1, s1, 0xffff
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_zextload_i64_constant_load:
@ -376,19 +376,19 @@ define amdgpu_kernel void @widen_i1_zext_to_i64_constant_load(i1 addrspace(4)* %
; SI-LABEL: widen_i1_zext_to_i64_constant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s0, s0, 1
; SI-NEXT: s_add_u32 s0, s0, 0x3e7
; SI-NEXT: s_addc_u32 s1, 0, 0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_and_b32 s1, s1, 1
; SI-NEXT: s_add_u32 s4, s1, 0x3e7
; SI-NEXT: s_addc_u32 s5, 0, 0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i1_zext_to_i64_constant_load:
@ -455,17 +455,17 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %ar
; SI-LABEL: widen_i16_global_invariant_load:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s4, 0
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_mov_b32 s5, s4
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_load_dword s1, s[0:1], 0x0
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_addk_i32 s0, 0x3e7
; SI-NEXT: s_or_b32 s0, s0, 1
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_addk_i32 s1, 0x3e7
; SI-NEXT: s_or_b32 s4, s1, 1
; SI-NEXT: s_mov_b32 s1, s0
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: widen_i16_global_invariant_load:

@ -26,7 +26,7 @@ entry:
ret void
; CHECK-LABEL: test2
; CHECK: addi 3, 3, 8
; CHECK: lxvx [[LD:[0-9]+]], 0, 3
; CHECK: addi [[REG:[0-9]+]], 4, 4
; CHECK: lxvx [[LD:[0-9]+]], 0, 3
; CHECK: stxvx [[LD]], 0, [[REG]]
}

@ -0,0 +1,92 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=machine-scheduler -o - %s | FileCheck %s
---
# Check that machine-scheduler's BotHeightReduce heuristic puts the LD 8 in
# between the final run of MULLDs and the LDXs that feed them, to try to hide
# the latency of the LDXs.
name: test
tracksRegLiveness: true
body: |
; CHECK-LABEL: name: test
; CHECK: bb.0:
; CHECK: successors: %bb.1(0x80000000)
; CHECK: liveins: $x3, $x4
; CHECK: [[COPY:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x4
; CHECK: [[COPY1:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY $x3
; CHECK: [[ADDI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = ADDI8 [[COPY1]], 1
; CHECK: [[CMPLDI:%[0-9]+]]:crrc = CMPLDI [[COPY]], 1
; CHECK: [[LI8_:%[0-9]+]]:g8rc_and_g8rc_nox0 = LI8 1
; CHECK: [[ISEL8_:%[0-9]+]]:g8rc = ISEL8 [[COPY]], [[LI8_]], [[CMPLDI]].sub_gt
; CHECK: MTCTR8loop [[ISEL8_]], implicit-def dead $ctr8
; CHECK: [[LI8_1:%[0-9]+]]:g8rc = LI8 0
; CHECK: [[LI8_2:%[0-9]+]]:g8rc = LI8 2
; CHECK: [[LI8_3:%[0-9]+]]:g8rc = LI8 3
; CHECK: [[LI8_4:%[0-9]+]]:g8rc = LI8 5
; CHECK: [[LI8_5:%[0-9]+]]:g8rc = LI8 6
; CHECK: [[LI8_6:%[0-9]+]]:g8rc = LI8 7
; CHECK: bb.1:
; CHECK: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK: [[ADDI8_1:%[0-9]+]]:g8rc = ADDI8 [[ADDI8_]], 1
; CHECK: [[LD:%[0-9]+]]:g8rc = LD 0, [[ADDI8_]] :: (load 8)
; CHECK: [[LDX:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_]] :: (load 8)
; CHECK: [[LDX1:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_3]] :: (load 8)
; CHECK: [[LD1:%[0-9]+]]:g8rc = LD 4, [[ADDI8_]] :: (load 8)
; CHECK: [[LDX2:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_4]] :: (load 8)
; CHECK: [[LDX3:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_5]] :: (load 8)
; CHECK: [[LDX4:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_6]] :: (load 8)
; CHECK: [[LDX5:%[0-9]+]]:g8rc = LDX [[ADDI8_]], [[LI8_2]] :: (load 8)
; CHECK: [[MULLD:%[0-9]+]]:g8rc = MULLD [[LDX]], [[LD]]
; CHECK: [[LD2:%[0-9]+]]:g8rc = LD 8, [[ADDI8_]] :: (load 8)
; CHECK: [[MULLD1:%[0-9]+]]:g8rc = MULLD [[MULLD]], [[LDX5]]
; CHECK: [[MULLD2:%[0-9]+]]:g8rc = MULLD [[MULLD1]], [[LDX1]]
; CHECK: [[MULLD3:%[0-9]+]]:g8rc = MULLD [[MULLD2]], [[LD1]]
; CHECK: [[MULLD4:%[0-9]+]]:g8rc = MULLD [[MULLD3]], [[LDX2]]
; CHECK: [[MULLD5:%[0-9]+]]:g8rc = MULLD [[MULLD4]], [[LDX3]]
; CHECK: [[MULLD6:%[0-9]+]]:g8rc = MULLD [[MULLD5]], [[LDX4]]
; CHECK: [[MADDLD8_:%[0-9]+]]:g8rc = MADDLD8 [[MULLD6]], [[LD2]], [[MADDLD8_]]
; CHECK: [[COPY2:%[0-9]+]]:g8rc_and_g8rc_nox0 = COPY [[ADDI8_1]]
; CHECK: BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8
; CHECK: B %bb.2
; CHECK: bb.2:
bb.0:
liveins: $x3, $x4
%0:g8rc_and_g8rc_nox0 = COPY $x4
%1:g8rc_and_g8rc_nox0 = COPY $x3
%2:g8rc_and_g8rc_nox0 = ADDI8 %1, 1
%3:crrc = CMPLDI %0, 1
%4:g8rc_and_g8rc_nox0 = LI8 1
%5:g8rc = ISEL8 %0, %4, %3.sub_gt
MTCTR8loop %5, implicit-def dead $ctr8
%6:g8rc = LI8 0
%7:g8rc = LI8 2
%8:g8rc = LI8 3
%9:g8rc = LI8 5
%10:g8rc = LI8 6
%11:g8rc = LI8 7
bb.1:
%12:g8rc = ADDI8 %2, 1
%13:g8rc = LD 0, %2 :: (load 8)
%14:g8rc = LDX %2, %4 :: (load 8)
%16:g8rc = LDX %2, %8 :: (load 8)
%17:g8rc = LD 4, %2 :: (load 8)
%18:g8rc = LDX %2, %9 :: (load 8)
%19:g8rc = LDX %2, %10 :: (load 8)
%20:g8rc = LDX %2, %11 :: (load 8)
%21:g8rc = LD 8, %2 :: (load 8)
%22:g8rc = MULLD %14, %13
%15:g8rc = LDX %2, %7 :: (load 8)
%23:g8rc = MULLD %22, %15
%24:g8rc = MULLD %23, %16
%25:g8rc = MULLD %24, %17
%26:g8rc = MULLD %25, %18
%27:g8rc = MULLD %26, %19
%28:g8rc = MULLD %27, %20
%6:g8rc = MADDLD8 %28, %21, %6
%2:g8rc_and_g8rc_nox0 = COPY %12
BDNZ8 %bb.1, implicit-def dead $ctr8, implicit $ctr8
B %bb.2
bb.2:
...

@ -417,11 +417,11 @@ define dso_local <16 x i8> @no_RAUW_in_combine_during_legalize(i32* nocapture re
; CHECK-P9-LABEL: no_RAUW_in_combine_during_legalize:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: sldi r4, r4, 2
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: lxsiwzx v2, r3, r4
; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha
; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l
; CHECK-P9-NEXT: lxvx v3, 0, r3
; CHECK-P9-NEXT: xxlxor v4, v4, v4
; CHECK-P9-NEXT: vperm v2, v4, v2, v3
; CHECK-P9-NEXT: blr
;

@ -5,18 +5,18 @@ define dso_local i64 @test1(i8* nocapture readonly %p, i32 signext %count) local
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li 5, -13
; CHECK-NEXT: lxvx 0, 3, 5
; CHECK-NEXT: li 5, 19
; CHECK-NEXT: lxvx 1, 3, 5
; CHECK-NEXT: li 5, 3
; CHECK-NEXT: li 6, 7
; CHECK-NEXT: li 7, 11
; CHECK-NEXT: li 8, 15
; CHECK-NEXT: mfvsrld 9, 0
; CHECK-NEXT: ldx 5, 3, 5
; CHECK-NEXT: lxvx 0, 3, 5
; CHECK-NEXT: li 5, 19
; CHECK-NEXT: ldx 6, 3, 6
; CHECK-NEXT: ldx 7, 3, 7
; CHECK-NEXT: lxvx 1, 3, 5
; CHECK-NEXT: li 5, 3
; CHECK-NEXT: ldx 5, 3, 5
; CHECK-NEXT: ldx 3, 3, 8
; CHECK-NEXT: mfvsrld 9, 0
; CHECK-NEXT: mffprd 8, 0
; CHECK-NEXT: mfvsrld 10, 1
; CHECK-NEXT: mffprd 11, 1

@ -508,9 +508,9 @@ define dso_local void @test_consecutive_i32(<4 x i32> %a, i32* nocapture %b) loc
; CHECK-P9-BE-LABEL: test_consecutive_i32:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3
; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, 0, r5
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3
; CHECK-P9-BE-NEXT: blr
entry:
@ -544,9 +544,9 @@ define dso_local void @test_consecutive_float(<4 x float> %a, float* nocapture %
; CHECK-P9-LABEL: test_consecutive_float:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-NEXT: li r3, 4
; CHECK-P9-NEXT: stfiwx f0, 0, r5
; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 3
; CHECK-P9-NEXT: li r3, 4
; CHECK-P9-NEXT: stfiwx f0, r5, r3
; CHECK-P9-NEXT: blr
;
@ -597,9 +597,9 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
; CHECK-P9-LABEL: test_stores_exceed_vec_size:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis r3, r2, .LCPI16_0@toc@ha
; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-NEXT: addi r3, r3, .LCPI16_0@toc@l
; CHECK-P9-NEXT: lxvx vs35, 0, r3
; CHECK-P9-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-NEXT: li r3, 16
; CHECK-P9-NEXT: stfiwx f0, r5, r3
; CHECK-P9-NEXT: li r3, 20
@ -611,10 +611,10 @@ define dso_local void @test_stores_exceed_vec_size(<4 x i32> %a, i32* nocapture
; CHECK-P9-BE-LABEL: test_stores_exceed_vec_size:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxspltw vs0, vs34, 0
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
; CHECK-P9-BE-NEXT: li r3, 16
; CHECK-P9-BE-NEXT: stxsiwx vs34, r5, r3
; CHECK-P9-BE-NEXT: li r3, 20
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs0, 2
; CHECK-P9-BE-NEXT: stxv vs0, 0(r5)
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 1
; CHECK-P9-BE-NEXT: stfiwx f0, r5, r3
@ -676,9 +676,9 @@ define void @test_5_consecutive_stores_of_bytes(<16 x i8> %a, i8* nocapture %b)
; CHECK-P9-LABEL: test_5_consecutive_stores_of_bytes:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4
; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 12
; CHECK-P9-NEXT: li r3, 1
; CHECK-P9-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 15
; CHECK-P9-NEXT: li r3, 2
@ -694,9 +694,9 @@ define void @test_5_consecutive_stores_of_bytes(<16 x i8> %a, i8* nocapture %b)
; CHECK-P9-BE-LABEL: test_5_consecutive_stores_of_bytes:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13
; CHECK-P9-BE-NEXT: li r3, 1
; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 5
; CHECK-P9-BE-NEXT: li r3, 1
; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 2
; CHECK-P9-BE-NEXT: li r3, 2
@ -807,9 +807,9 @@ define void @test_13_consecutive_stores_of_bytes(<16 x i8> %a, i8* nocapture %b)
; CHECK-P9-NEXT: li r3, 4
; CHECK-P9-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 4
; CHECK-P9-NEXT: li r3, 5
; CHECK-P9-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 8
; CHECK-P9-NEXT: li r3, 5
; CHECK-P9-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-NEXT: vsldoi v3, v2, v2, 13
; CHECK-P9-NEXT: li r3, 6
@ -848,9 +848,9 @@ define void @test_13_consecutive_stores_of_bytes(<16 x i8> %a, i8* nocapture %b)
; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 13
; CHECK-P9-BE-NEXT: li r3, 5
; CHECK-P9-BE-NEXT: stxsibx vs35, 0, r5
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 9
; CHECK-P9-BE-NEXT: li r3, 5
; CHECK-P9-BE-NEXT: stxsibx vs35, r5, r3
; CHECK-P9-BE-NEXT: vsldoi v3, v2, v2, 4
; CHECK-P9-BE-NEXT: li r3, 6
@ -947,8 +947,8 @@ define void @test_elements_from_two_vec(<4 x i32> %a, <4 x i32> %b, i32* nocaptu
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 3
; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3
; CHECK-P9-BE-NEXT: stxsiwx vs35, 0, r7
; CHECK-P9-BE-NEXT: stfiwx f0, r7, r3
; CHECK-P9-BE-NEXT: blr
entry:
%vecext = extractelement <4 x i32> %a, i32 0
@ -996,9 +996,9 @@ define dso_local void @test_elements_from_three_vec(<4 x float> %a, <4 x float>
; CHECK-P9-BE-LABEL: test_elements_from_three_vec:
; CHECK-P9-BE: # %bb.0: # %entry
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs34, vs34, 2
; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, 0, r9
; CHECK-P9-BE-NEXT: xxsldwi vs0, vs35, vs35, 1
; CHECK-P9-BE-NEXT: li r3, 4
; CHECK-P9-BE-NEXT: stfiwx f0, r9, r3
; CHECK-P9-BE-NEXT: li r3, 8
; CHECK-P9-BE-NEXT: stxsiwx vs36, r9, r3

@ -228,8 +228,8 @@ define fp128 @testMixedAggregate_03([4 x i128] %sa.coerce) {
; CHECK-LABEL: testMixedAggregate_03:
; CHECK: # %bb.0: # %entry
; CHECK: mtvsrwa v2, r3
; CHECK: xscvsdqp v2, v2
; CHECK: mtvsrdd v3, r6, r5
; CHECK-DAG: xscvsdqp v2, v2
; CHECK-DAG: mtvsrdd v3, r6, r5
; CHECK: xsaddqp v2, v3, v2
; CHECK: mtvsrd v[[REG1:[0-9]+]], r10
; CHECK: xscvsdqp v[[REG:[0-9]+]], v[[REG1]]
@ -350,12 +350,12 @@ define fp128 @sum_float128(i32 signext %count, ...) {
; CHECK-NEXT: bltlr cr0
; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: addi r3, r1, 40
; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72
; CHECK-NEXT: lxvx v3, 0, r3
; CHECK-NEXT: std [[REG2]], -8(r1)
; CHECK-NEXT: xsaddqp v2, v3, v2
; CHECK-NEXT: lxv v3, 16(r3)
; CHECK-NEXT: xsaddqp v2, v2, v3
; CHECK-NEXT: addi [[REG2:r[0-9]+]], r1, 72
; CHECK-NEXT: std [[REG2]], -8(r1)
; CHECK-NEXT: blr
entry:
%ap = alloca i8*, align 8

@ -444,10 +444,10 @@ define void @qpConv2dp_03(double* nocapture %res, i32 signext %idx) {
; CHECK-LABEL: qpConv2dp_03:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, .LC7@toc@ha
; CHECK-NEXT: sldi r4, r4, 3
; CHECK-NEXT: ld r5, .LC7@toc@l(r5)
; CHECK-NEXT: lxvx v2, 0, r5
; CHECK-NEXT: xscvqpdp v2, v2
; CHECK-NEXT: sldi r4, r4, 3
; CHECK-NEXT: stxsdx v2, r3, r4
; CHECK-NEXT: blr
entry:
@ -517,11 +517,11 @@ define void @qpConv2sp_03(float* nocapture %res, i32 signext %idx) {
; CHECK-LABEL: qpConv2sp_03:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, .LC7@toc@ha
; CHECK-NEXT: sldi r4, r4, 2
; CHECK-NEXT: ld r5, .LC7@toc@l(r5)
; CHECK-NEXT: lxv v2, 48(r5)
; CHECK-NEXT: xscvqpdpo v2, v2
; CHECK-NEXT: xsrsp f0, v2
; CHECK-NEXT: sldi r4, r4, 2
; CHECK-NEXT: stfsx f0, r3, r4
; CHECK-NEXT: blr
entry:

@ -153,13 +153,13 @@ define fp128 @mixParam_02(fp128 %p1, double %p2, i64* nocapture %p3,
; CHECK: # %bb.0: # %entry
; CHECK: lwz r3, 96(r1)
; CHECK: add r4, r7, r9
; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1
; CHECK: add r4, r4, r10
; CHECK: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK: add r3, r4, r3
; CHECK: clrldi r3, r3, 32
; CHECK: std r3, 0(r6)
; CHECK: lxv v[[REG1:[0-9]+]], 0(r8)
; CHECK: xscpsgndp v[[REG0:[0-9]+]], f1, f1
; CHECK: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK: xsaddqp v2, v[[REG1]], v2
; CHECK: xsaddqp v2, v2, v3
; CHECK-NEXT: blr
@ -185,13 +185,13 @@ define fastcc fp128 @mixParam_02f(fp128 %p1, double %p2, i64* nocapture %p3,
; CHECK-LABEL: mixParam_02f:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: add r4, r4, r6
; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1
; CHECK-NEXT: add r4, r4, r7
; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK-NEXT: add r4, r4, r8
; CHECK-NEXT: clrldi r4, r4, 32
; CHECK-DAG: std r4, 0(r3)
; CHECK-DAG: lxv v[[REG1:[0-9]+]], 0(r5)
; CHECK-NEXT: xscpsgndp v[[REG0:[0-9]+]], f1, f1
; CHECK-NEXT: xscvdpqp v[[REG0]], v[[REG0]]
; CHECK-NEXT: xsaddqp v2, v[[REG1]], v2
; CHECK-NEXT: xsaddqp v2, v2, v[[REG0]]
; CHECK-NEXT: blr

@ -32,10 +32,19 @@ define signext i32 @test() nounwind {
; CHECK-NEXT: std 0, 16(1)
; CHECK-NEXT: stdu 1, -192(1)
; CHECK-NEXT: addis 3, 2, a1@toc@ha
; CHECK-NEXT: addis 5, 2, a16@toc@ha
; CHECK-NEXT: addis 6, 2, a17@toc@ha
; CHECK-NEXT: addis 4, 2, a15@toc@ha
; CHECK-NEXT: lfd 1, a1@toc@l(3)
; CHECK-NEXT: addis 3, 2, a2@toc@ha
; CHECK-NEXT: addi 5, 5, a16@toc@l
; CHECK-NEXT: addi 6, 6, a17@toc@l
; CHECK-NEXT: ld 4, a15@toc@l(4)
; CHECK-NEXT: lfd 2, a2@toc@l(3)
; CHECK-NEXT: addis 3, 2, a3@toc@ha
; CHECK-NEXT: lxvx 34, 0, 6
; CHECK-NEXT: lxvx 0, 0, 5
; CHECK-NEXT: li 5, 152
; CHECK-NEXT: lfd 3, a3@toc@l(3)
; CHECK-NEXT: addis 3, 2, a4@toc@ha
; CHECK-NEXT: lfd 4, a4@toc@l(3)
@ -54,17 +63,8 @@ define signext i32 @test() nounwind {
; CHECK-NEXT: addis 3, 2, a11@toc@ha
; CHECK-NEXT: lfd 11, a11@toc@l(3)
; CHECK-NEXT: addis 3, 2, a12@toc@ha
; CHECK-NEXT: addis 5, 2, a16@toc@ha
; CHECK-NEXT: addis 6, 2, a17@toc@ha
; CHECK-NEXT: addi 6, 6, a17@toc@l
; CHECK-NEXT: lxvx 34, 0, 6
; CHECK-NEXT: lfd 12, a12@toc@l(3)
; CHECK-NEXT: addis 3, 2, a13@toc@ha
; CHECK-NEXT: addi 5, 5, a16@toc@l
; CHECK-NEXT: addis 4, 2, a15@toc@ha
; CHECK-NEXT: lxvx 0, 0, 5
; CHECK-NEXT: ld 4, a15@toc@l(4)
; CHECK-NEXT: li 5, 152
; CHECK-NEXT: lfd 13, a13@toc@l(3)
; CHECK-NEXT: addis 3, 2, a14@toc@ha
; CHECK-NEXT: ld 3, a14@toc@l(3)

@ -697,10 +697,10 @@ define <4 x float> @test_extend32_vec4(<4 x half>* %p) #0 {
; CHECK-NEXT: lhz r3, 0(r3)
; CHECK-NEXT: xxmrghd vs0, vs0, vs1
; CHECK-NEXT: mtfprwz f3, r3
; CHECK-NEXT: xvcvdpsp vs35, vs0
; CHECK-NEXT: xscvhpdp f3, f3
; CHECK-NEXT: xxmrghd vs2, vs2, vs3
; CHECK-NEXT: xvcvdpsp vs34, vs2
; CHECK-NEXT: xvcvdpsp vs35, vs0
; CHECK-NEXT: vmrgew v2, v3, v2
; CHECK-NEXT: blr
;
@ -906,12 +906,12 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
; CHECK-LABEL: test_trunc32_vec4:
; CHECK: # %bb.0:
; CHECK-NEXT: xxsldwi vs0, vs34, vs34, 3
; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1
; CHECK-NEXT: xscvspdpn f0, vs0
; CHECK-NEXT: xscvspdpn f1, vs1
; CHECK-NEXT: xscvdphp f0, f0
; CHECK-NEXT: mffprwz r3, f0
; CHECK-NEXT: xxswapd vs0, vs34
; CHECK-NEXT: xxsldwi vs1, vs34, vs34, 1
; CHECK-NEXT: xscvspdpn f1, vs1
; CHECK-NEXT: xscvspdpn f0, vs0
; CHECK-NEXT: xscvdphp f0, f0
; CHECK-NEXT: xscvdphp f1, f1
@ -920,8 +920,8 @@ define void @test_trunc32_vec4(<4 x float> %a, <4 x half>* %p) #0 {
; CHECK-NEXT: xscvdphp f1, f1
; CHECK-NEXT: sth r4, 4(r5)
; CHECK-NEXT: mffprwz r4, f0
; CHECK-NEXT: sth r4, 2(r5)
; CHECK-NEXT: sth r3, 0(r5)
; CHECK-NEXT: sth r4, 2(r5)
; CHECK-NEXT: mffprwz r6, f1
; CHECK-NEXT: sth r6, 6(r5)
; CHECK-NEXT: blr
@ -1059,10 +1059,10 @@ define void @test_trunc64_vec4(<4 x double> %a, <4 x half>* %p) #0 {
; CHECK-NEXT: xscvdphp f1, vs34
; CHECK-NEXT: mffprwz r4, f1
; CHECK-NEXT: xscvdphp f1, vs35
; CHECK-NEXT: sth r3, 0(r7)
; CHECK-NEXT: sth r4, 2(r7)
; CHECK-NEXT: mffprwz r4, f0
; CHECK-NEXT: sth r4, 4(r7)
; CHECK-NEXT: sth r3, 0(r7)
; CHECK-NEXT: mffprwz r5, f1
; CHECK-NEXT: sth r5, 6(r7)
; CHECK-NEXT: blr
@ -1169,8 +1169,8 @@ define float @test_sitofp_fadd_i32(i32 %a, half* %b) #0 {
; CHECK-LABEL: test_sitofp_fadd_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: mtfprwa f1, r3
; CHECK-NEXT: xscvsxdsp f1, f1
; CHECK-NEXT: lhz r4, 0(r4)
; CHECK-NEXT: xscvsxdsp f1, f1
; CHECK-NEXT: mtfprwz f0, r4
; CHECK-NEXT: xscvhpdp f0, f0
; CHECK-NEXT: xscvdphp f1, f1

@ -132,8 +132,8 @@ define <4 x i32> @load_swap11(<4 x i32>* %vp1, <4 x i32>* %vp2) {
; CHECK-P9-BE-LABEL: load_swap11:
; CHECK-P9-BE: # %bb.0:
; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha
; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-P9-BE-NEXT: lxv v2, 0(r4)
; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-P9-BE-NEXT: lxvx v3, 0, r3
; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-BE-NEXT: blr
@ -208,8 +208,8 @@ define <8 x i16> @load_swap21(<8 x i16>* %vp1, <8 x i16>* %vp2){
; CHECK-P9-BE-LABEL: load_swap21:
; CHECK-P9-BE: # %bb.0:
; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI5_0@toc@ha
; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l
; CHECK-P9-BE-NEXT: lxv v2, 0(r4)
; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI5_0@toc@l
; CHECK-P9-BE-NEXT: lxvx v3, 0, r3
; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-BE-NEXT: blr
@ -382,8 +382,8 @@ define <4 x float> @load_swap51(<4 x float>* %vp1, <4 x float>* %vp2) {
; CHECK-P9-BE-LABEL: load_swap51:
; CHECK-P9-BE: # %bb.0:
; CHECK-P9-BE-NEXT: addis r3, r2, .LCPI10_0@toc@ha
; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l
; CHECK-P9-BE-NEXT: lxv v2, 0(r4)
; CHECK-P9-BE-NEXT: addi r3, r3, .LCPI10_0@toc@l
; CHECK-P9-BE-NEXT: lxvx v3, 0, r3
; CHECK-P9-BE-NEXT: vperm v2, v2, v2, v3
; CHECK-P9-BE-NEXT: blr

@ -23,11 +23,11 @@ define i64 @test_no_prep(i8* %0, i32 signext %1) {
; CHECK: .LBB0_2: #
; CHECK-NEXT: ldx r9, r3, r6
; CHECK-NEXT: ldx r10, r3, r7
; CHECK-NEXT: mulld r9, r10, r9
; CHECK-NEXT: ldx r11, r3, r8
; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: ld r12, 0(r3)
; CHECK-NEXT: addi r3, r3, 1
; CHECK-NEXT: mulld r9, r10, r9
; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: maddld r5, r9, r12, r5
; CHECK-NEXT: bdnz .LBB0_2
%3 = sext i32 %1 to i64
@ -87,11 +87,11 @@ define i64 @test_ds_prep(i8* %0, i32 signext %1) {
; CHECK: .LBB1_2: #
; CHECK-NEXT: ldx r9, r6, r7
; CHECK-NEXT: ld r10, 0(r6)
; CHECK-NEXT: mulld r9, r10, r9
; CHECK-NEXT: ldx r11, r6, r5
; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: addi r8, r6, 1
; CHECK-NEXT: ld r6, 4(r6)
; CHECK-NEXT: mulld r9, r10, r9
; CHECK-NEXT: mulld r9, r9, r11
; CHECK-NEXT: maddld r3, r9, r6, r3
; CHECK-NEXT: mr r6, r8
; CHECK-NEXT: bdnz .LBB1_2
@ -162,22 +162,22 @@ define i64 @test_max_number_reminder(i8* %0, i32 signext %1) {
; CHECK: .LBB2_2: #
; CHECK-NEXT: ldx r12, r9, r6
; CHECK-NEXT: ld r0, 0(r9)
; CHECK-NEXT: mulld r12, r0, r12
; CHECK-NEXT: ldx r30, r9, r5
; CHECK-NEXT: ldx r29, r9, r7
; CHECK-NEXT: addi r11, r9, 1
; CHECK-NEXT: ldx r30, r9, r7
; CHECK-NEXT: ld r29, 4(r9)
; CHECK-NEXT: ldx r28, r9, r8
; CHECK-NEXT: ld r27, 12(r9)
; CHECK-NEXT: ld r26, 8(r9)
; CHECK-NEXT: ldx r25, r9, r10
; CHECK-NEXT: ldx r9, r9, r5
; CHECK-NEXT: mulld r9, r12, r9
; CHECK-NEXT: mulld r9, r9, r30
; CHECK-NEXT: mulld r9, r9, r29
; CHECK-NEXT: mulld r9, r9, r28
; CHECK-NEXT: mulld r9, r9, r27
; CHECK-NEXT: mulld r9, r9, r26
; CHECK-NEXT: maddld r3, r9, r25, r3
; CHECK-NEXT: mulld r12, r0, r12
; CHECK-NEXT: ld r28, 4(r9)
; CHECK-NEXT: ldx r27, r9, r8
; CHECK-NEXT: ld r26, 12(r9)
; CHECK-NEXT: ld r25, 8(r9)
; CHECK-NEXT: ldx r9, r9, r10
; CHECK-NEXT: mulld r12, r12, r30
; CHECK-NEXT: mulld r12, r12, r29
; CHECK-NEXT: mulld r12, r12, r28
; CHECK-NEXT: mulld r12, r12, r27
; CHECK-NEXT: mulld r12, r12, r26
; CHECK-NEXT: mulld r12, r12, r25
; CHECK-NEXT: maddld r3, r12, r9, r3
; CHECK-NEXT: mr r9, r11
; CHECK-NEXT: bdnz .LBB2_2
%3 = sext i32 %1 to i64
@ -257,10 +257,10 @@ define dso_local i64 @test_update_ds_prep_interact(i8* %0, i32 signext %1) {
; CHECK: .LBB3_2: #
; CHECK-NEXT: ldu r8, 4(r3)
; CHECK-NEXT: ldx r9, r3, r7
; CHECK-NEXT: mulld r8, r8, r9
; CHECK-NEXT: ldx r10, r3, r6
; CHECK-NEXT: mulld r8, r8, r10
; CHECK-NEXT: ld r11, 4(r3)
; CHECK-NEXT: mulld r8, r8, r9
; CHECK-NEXT: mulld r8, r8, r10
; CHECK-NEXT: maddld r5, r8, r11, r5
; CHECK-NEXT: bdnz .LBB3_2
%3 = sext i32 %1 to i64
@ -391,21 +391,21 @@ define dso_local i64 @test_ds_multiple_chains(i8* %0, i8* %1, i32 signext %2) {
; CHECK: .LBB5_2: #
; CHECK-NEXT: ld r8, 0(r3)
; CHECK-NEXT: ldx r9, r3, r7
; CHECK-NEXT: mulld r8, r9, r8
; CHECK-NEXT: ld r9, 4(r3)
; CHECK-NEXT: mulld r8, r8, r9
; CHECK-NEXT: ld r10, 8(r3)
; CHECK-NEXT: ld r10, 4(r3)
; CHECK-NEXT: ld r11, 8(r3)
; CHECK-NEXT: addi r3, r3, 1
; CHECK-NEXT: mulld r8, r8, r10
; CHECK-NEXT: ld r11, 0(r4)
; CHECK-NEXT: mulld r8, r8, r11
; CHECK-NEXT: ldx r12, r4, r7
; CHECK-NEXT: mulld r8, r8, r12
; CHECK-NEXT: ld r0, 4(r4)
; CHECK-NEXT: mulld r8, r8, r0
; CHECK-NEXT: ld r30, 8(r4)
; CHECK-NEXT: mulld r8, r9, r8
; CHECK-NEXT: ld r12, 0(r4)
; CHECK-NEXT: ldx r0, r4, r7
; CHECK-NEXT: ld r30, 4(r4)
; CHECK-NEXT: ld r9, 8(r4)
; CHECK-NEXT: addi r4, r4, 1
; CHECK-NEXT: maddld r6, r8, r30, r6
; CHECK-NEXT: mulld r8, r8, r10
; CHECK-NEXT: mulld r8, r8, r11
; CHECK-NEXT: mulld r8, r8, r12
; CHECK-NEXT: mulld r8, r8, r0
; CHECK-NEXT: mulld r8, r8, r30
; CHECK-NEXT: maddld r6, r8, r9, r6
; CHECK-NEXT: bdnz .LBB5_2
%4 = sext i32 %2 to i64
%5 = icmp eq i32 %2, 0
@ -710,10 +710,10 @@ define float @test_ds_combine_float_int(i8* %0, i32 signext %1) {
; CHECK-NEXT: lfsx f0, r3, r4
; CHECK-NEXT: xscvuxdsp f4, f4
; CHECK-NEXT: lfs f2, 20(r3)
; CHECK-NEXT: xsmulsp f0, f0, f4
; CHECK-NEXT: xsmulsp f0, f2, f0
; CHECK-NEXT: lfs f3, 60(r3)
; CHECK-NEXT: addi r3, r3, 1
; CHECK-NEXT: xsmulsp f0, f0, f4
; CHECK-NEXT: xsmulsp f0, f2, f0
; CHECK-NEXT: xsmulsp f0, f3, f0
; CHECK-NEXT: xsaddsp f1, f1, f0
; CHECK-NEXT: bdnz .LBB8_2

@ -109,10 +109,10 @@ define dso_local signext i32 @foo(i32 signext %x, i32 signext %y) nounwind {
; CHECK-P9-NEXT: b .LBB1_2
; CHECK-P9-NEXT: .LBB1_7: # %while.end
; CHECK-P9-NEXT: lis r3, -13108
; CHECK-P9-NEXT: ori r3, r3, 52429
; CHECK-P9-NEXT: mullw r3, r28, r3
; CHECK-P9-NEXT: lis r4, 13107
; CHECK-P9-NEXT: ori r3, r3, 52429
; CHECK-P9-NEXT: ori r4, r4, 13108
; CHECK-P9-NEXT: mullw r3, r28, r3
; CHECK-P9-NEXT: cmplw r3, r4
; CHECK-P9-NEXT: blt cr0, .LBB1_9
; CHECK-P9-NEXT: # %bb.8: # %if.then8

@ -1397,10 +1397,10 @@ define void @test_constrained_libcall_multichain(float* %firstptr, ppc_fp128* %r
; PC64LE9-NEXT: li 3, 0
; PC64LE9-NEXT: xxlxor 2, 2, 2
; PC64LE9-NEXT: xxlxor 4, 4, 4
; PC64LE9-NEXT: mr 30, 4
; PC64LE9-NEXT: std 3, 8(4)
; PC64LE9-NEXT: fmr 1, 31
; PC64LE9-NEXT: fmr 3, 31
; PC64LE9-NEXT: mr 30, 4
; PC64LE9-NEXT: stfd 31, 0(4)
; PC64LE9-NEXT: bl __gcc_qadd
; PC64LE9-NEXT: nop

@ -14,8 +14,8 @@ define dso_local void @h() local_unnamed_addr #0 {
; CHECK-NEXT: std 0, 16(1)
; CHECK-NEXT: stdu 1, -64(1)
; CHECK-NEXT: addis 3, 2, g@toc@ha
; CHECK-NEXT: lwz 3, g@toc@l(3)
; CHECK-NEXT: std 30, 48(1) # 8-byte Folded Spill
; CHECK-NEXT: lwz 3, g@toc@l(3)
; CHECK-NEXT: extswsli 30, 3, 2
; CHECK-NEXT: addis 3, 2, f@got@tlsld@ha
; CHECK-NEXT: addi 3, 3, f@got@tlsld@l

@ -20,13 +20,13 @@ define hidden void @julia_tryparse_internal_45896() #0 {
; CHECK-NEXT: .LBB0_6: # %L1057.preheader
; CHECK-NEXT: .LBB0_7: # %L670
; CHECK-NEXT: lis r5, 4095
; CHECK-NEXT: ori r5, r5, 65533
; CHECK-NEXT: sldi r5, r5, 4
; CHECK-NEXT: cmpdi r3, 0
; CHECK-NEXT: sradi r4, r3, 63
; CHECK-NEXT: ori r5, r5, 65533
; CHECK-NEXT: crnot 4*cr5+gt, eq
; CHECK-NEXT: sldi r5, r5, 4
; CHECK-NEXT: mulhdu r3, r3, r5
; CHECK-NEXT: maddld r6, r4, r5, r3
; CHECK-NEXT: crnot 4*cr5+gt, eq
; CHECK-NEXT: cmpld r6, r3
; CHECK-NEXT: mulld r3, r4, r5
; CHECK-NEXT: cmpldi cr1, r3, 0

@ -223,9 +223,9 @@ define <1 x i128> @rotl_28(<1 x i128> %num) {
; P9-NOVSX-NEXT: rldimi r5, r3, 28, 0
; P9-NOVSX-NEXT: rotldi r3, r3, 28
; P9-NOVSX-NEXT: rldimi r3, r4, 28, 0
; P9-NOVSX-NEXT: std r5, -8(r1)
; P9-NOVSX-NEXT: std r3, -16(r1)
; P9-NOVSX-NEXT: addi r3, r1, -16
; P9-NOVSX-NEXT: std r5, -8(r1)
; P9-NOVSX-NEXT: lvx v2, 0, r3
; P9-NOVSX-NEXT: blr
;

@ -13,29 +13,29 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxsd v5, 0(r5)
; CHECK-NEXT: addis r5, r2, .LCPI0_0@toc@ha
; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: li r6, 0
; CHECK-NEXT: addi r5, r5, .LCPI0_0@toc@l
; CHECK-NEXT: lxvx v2, 0, r5
; CHECK-NEXT: addis r5, r2, .LCPI0_1@toc@ha
; CHECK-NEXT: addi r5, r5, .LCPI0_1@toc@l
; CHECK-NEXT: lxvx v4, 0, r5
; CHECK-NEXT: li r5, 4
; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: vperm v0, v3, v5, v2
; CHECK-NEXT: mtctr r5
; CHECK-NEXT: li r5, 0
; CHECK-NEXT: vperm v1, v3, v5, v4
; CHECK-NEXT: li r6, 0
; CHECK-NEXT: xvnegsp v5, v0
; CHECK-NEXT: xvnegsp v0, v1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_1: # %for.cond1.preheader
; CHECK-NEXT: #
; CHECK-NEXT: lxsd v1, 0(r3)
; CHECK-NEXT: add r7, r3, r4
; CHECK-NEXT: vperm v6, v3, v1, v4
; CHECK-NEXT: vperm v1, v3, v1, v2
; CHECK-NEXT: xvnegsp v1, v1
; CHECK-NEXT: xvnegsp v6, v6
; CHECK-NEXT: add r7, r3, r4
; CHECK-NEXT: vabsduw v1, v1, v5
; CHECK-NEXT: vabsduw v6, v6, v0
; CHECK-NEXT: vadduwm v1, v6, v1
@ -47,10 +47,11 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; CHECK-NEXT: vextuwrx r3, r5, v1
; CHECK-NEXT: vperm v7, v3, v6, v4
; CHECK-NEXT: vperm v6, v3, v6, v2
; CHECK-NEXT: add r6, r3, r6
; CHECK-NEXT: add r3, r7, r4
; CHECK-NEXT: xvnegsp v6, v6
; CHECK-NEXT: xvnegsp v1, v7
; CHECK-NEXT: vabsduw v6, v6, v5
; CHECK-NEXT: add r6, r3, r6
; CHECK-NEXT: vabsduw v1, v1, v0
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: xxswapd v6, v1
@ -58,7 +59,6 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; CHECK-NEXT: xxspltw v6, v1, 2
; CHECK-NEXT: vadduwm v1, v1, v6
; CHECK-NEXT: vextuwrx r8, r5, v1
; CHECK-NEXT: add r3, r7, r4
; CHECK-NEXT: add r6, r8, r6
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
@ -69,25 +69,26 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfd f0, 0(r5)
; P9BE-NEXT: addis r5, r2, .LCPI0_0@toc@ha
; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: li r6, 0
; P9BE-NEXT: addi r5, r5, .LCPI0_0@toc@l
; P9BE-NEXT: lxvx v2, 0, r5
; P9BE-NEXT: addis r5, r2, .LCPI0_1@toc@ha
; P9BE-NEXT: xxlor v5, vs0, vs0
; P9BE-NEXT: addi r5, r5, .LCPI0_1@toc@l
; P9BE-NEXT: lxvx v4, 0, r5
; P9BE-NEXT: li r5, 4
; P9BE-NEXT: xxlor v5, vs0, vs0
; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: vperm v0, v3, v5, v2
; P9BE-NEXT: mtctr r5
; P9BE-NEXT: li r5, 0
; P9BE-NEXT: vperm v1, v3, v5, v4
; P9BE-NEXT: li r6, 0
; P9BE-NEXT: xvnegsp v5, v0
; P9BE-NEXT: xvnegsp v0, v1
; P9BE-NEXT: .p2align 4
; P9BE-NEXT: .LBB0_1: # %for.cond1.preheader
; P9BE-NEXT: #
; P9BE-NEXT: lfd f0, 0(r3)
; P9BE-NEXT: add r7, r3, r4
; P9BE-NEXT: xxlor v1, vs0, vs0
; P9BE-NEXT: lfdx f0, r3, r4
; P9BE-NEXT: vperm v6, v3, v1, v4
@ -104,20 +105,19 @@ define signext i32 @test_pre_inc_disable_1(i8* nocapture readonly %pix1, i32 sig
; P9BE-NEXT: xxlor v6, vs0, vs0
; P9BE-NEXT: vperm v7, v3, v6, v4
; P9BE-NEXT: vperm v6, v3, v6, v2
; P9BE-NEXT: add r7, r3, r4
; P9BE-NEXT: vextuwlx r3, r5, v1
; P9BE-NEXT: xvnegsp v6, v6
; P9BE-NEXT: add r6, r3, r6
; P9BE-NEXT: xvnegsp v1, v7
; P9BE-NEXT: vabsduw v1, v1, v0
; P9BE-NEXT: add r3, r7, r4
; P9BE-NEXT: vabsduw v6, v6, v5
; P9BE-NEXT: vabsduw v1, v1, v0
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: xxswapd v6, v1
; P9BE-NEXT: add r6, r3, r6
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: xxspltw v6, v1, 1
; P9BE-NEXT: vadduwm v1, v1, v6
; P9BE-NEXT: vextuwlx r8, r5, v1
; P9BE-NEXT: add r3, r7, r4
; P9BE-NEXT: add r6, r8, r6
; P9BE-NEXT: bdnz .LBB0_1
; P9BE-NEXT: # %bb.2: # %for.cond.cleanup
@ -180,13 +180,14 @@ define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* noc
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxsd v2, 0(r3)
; CHECK-NEXT: addis r3, r2, .LCPI1_0@toc@ha
; CHECK-NEXT: lxsd v1, 0(r4)
; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: addi r3, r3, .LCPI1_0@toc@l
; CHECK-NEXT: lxvx v4, 0, r3
; CHECK-NEXT: addis r3, r2, .LCPI1_1@toc@ha
; CHECK-NEXT: addi r3, r3, .LCPI1_1@toc@l
; CHECK-NEXT: lxvx v0, 0, r3
; CHECK-NEXT: lxsd v1, 0(r4)
; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: vperm v5, v3, v2, v4
; CHECK-NEXT: vperm v2, v3, v2, v0
; CHECK-NEXT: vperm v0, v3, v1, v0
@ -198,7 +199,6 @@ define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* noc
; CHECK-NEXT: vadduwm v2, v2, v3
; CHECK-NEXT: xxspltw v3, v2, 2
; CHECK-NEXT: vadduwm v2, v2, v3
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: vextuwrx r3, r3, v2
; CHECK-NEXT: extsw r3, r3
; CHECK-NEXT: blr
@ -207,6 +207,7 @@ define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* noc
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: lfd f0, 0(r3)
; P9BE-NEXT: addis r3, r2, .LCPI1_0@toc@ha
; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: addi r3, r3, .LCPI1_0@toc@l
; P9BE-NEXT: lxvx v4, 0, r3
; P9BE-NEXT: addis r3, r2, .LCPI1_1@toc@ha
@ -214,8 +215,8 @@ define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* noc
; P9BE-NEXT: xxlor v2, vs0, vs0
; P9BE-NEXT: lfd f0, 0(r4)
; P9BE-NEXT: lxvx v0, 0, r3
; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: xxlor v1, vs0, vs0
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vperm v5, v3, v2, v4
; P9BE-NEXT: vperm v2, v3, v2, v0
; P9BE-NEXT: vperm v0, v3, v1, v0
@ -227,7 +228,6 @@ define signext i32 @test_pre_inc_disable_2(i8* nocapture readonly %pix1, i8* noc
; P9BE-NEXT: vadduwm v2, v2, v3
; P9BE-NEXT: xxspltw v3, v2, 1
; P9BE-NEXT: vadduwm v2, v2, v3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuwlx r3, r3, v2
; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: blr
@ -283,11 +283,11 @@ define void @test32(i8* nocapture readonly %pix2, i32 signext %i_pix2) {
; CHECK-NEXT: add r5, r3, r4
; CHECK-NEXT: lxsiwzx v2, r3, r4
; CHECK-NEXT: addis r3, r2, .LCPI2_0@toc@ha
; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: addi r3, r3, .LCPI2_0@toc@l
; CHECK-NEXT: lxvx v4, 0, r3
; CHECK-NEXT: li r3, 4
; CHECK-NEXT: lxsiwzx v5, r5, r3
; CHECK-NEXT: xxlxor v3, v3, v3
; CHECK-NEXT: vperm v2, v2, v3, v4
; CHECK-NEXT: vperm v3, v5, v3, v4
; CHECK-NEXT: vspltisw v4, 8
@ -304,12 +304,12 @@ define void @test32(i8* nocapture readonly %pix2, i32 signext %i_pix2) {
; P9BE-NEXT: add r5, r3, r4
; P9BE-NEXT: lfiwzx f0, r3, r4
; P9BE-NEXT: addis r3, r2, .LCPI2_0@toc@ha
; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: xxsldwi v2, f0, f0, 1
; P9BE-NEXT: addi r3, r3, .LCPI2_0@toc@l
; P9BE-NEXT: lxvx v4, 0, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: xxsldwi v2, f0, f0, 1
; P9BE-NEXT: lfiwzx f0, r5, r3
; P9BE-NEXT: xxlxor v3, v3, v3
; P9BE-NEXT: vperm v2, v3, v2, v4
; P9BE-NEXT: xxsldwi v5, f0, f0, 1
; P9BE-NEXT: vperm v3, v3, v5, v4
@ -349,16 +349,16 @@ define void @test16(i16* nocapture readonly %sums, i32 signext %delta, i32 signe
; CHECK-LABEL: test16:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r6, r3, r4
; CHECK-NEXT: li r7, 16
; CHECK-NEXT: lxsihzx v2, r6, r7
; CHECK-NEXT: add r6, r3, r4
; CHECK-NEXT: lxsihzx v4, r3, r4
; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha
; CHECK-NEXT: lxsihzx v2, r6, r7
; CHECK-NEXT: li r6, 0
; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-NEXT: mtvsrd v3, r6
; CHECK-NEXT: vsplth v4, v4, 3
; CHECK-NEXT: vsplth v2, v2, 3
; CHECK-NEXT: addis r3, r2, .LCPI3_0@toc@ha
; CHECK-NEXT: addi r3, r3, .LCPI3_0@toc@l
; CHECK-NEXT: vmrghh v4, v3, v4
; CHECK-NEXT: vmrghh v2, v3, v2
; CHECK-NEXT: vsplth v3, v3, 3
@ -376,17 +376,17 @@ define void @test16(i16* nocapture readonly %sums, i32 signext %delta, i32 signe
; P9BE-LABEL: test16:
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: sldi r4, r4, 1
; P9BE-NEXT: add r6, r3, r4
; P9BE-NEXT: li r7, 16
; P9BE-NEXT: lxsihzx v2, r6, r7
; P9BE-NEXT: add r6, r3, r4
; P9BE-NEXT: lxsihzx v4, r3, r4
; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha
; P9BE-NEXT: lxsihzx v2, r6, r7
; P9BE-NEXT: li r6, 0
; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; P9BE-NEXT: sldi r6, r6, 48
; P9BE-NEXT: vsplth v4, v4, 3
; P9BE-NEXT: mtvsrd v3, r6
; P9BE-NEXT: vsplth v2, v2, 3
; P9BE-NEXT: addis r3, r2, .LCPI3_0@toc@ha
; P9BE-NEXT: addi r3, r3, .LCPI3_0@toc@l
; P9BE-NEXT: vmrghh v4, v3, v4
; P9BE-NEXT: vmrghh v2, v3, v2
; P9BE-NEXT: vsplth v3, v3, 0
@ -441,11 +441,11 @@ define void @test8(i8* nocapture readonly %sums, i32 signext %delta, i32 signext
; CHECK-NEXT: mtvsrd v3, r3
; CHECK-NEXT: li r3, 8
; CHECK-NEXT: lxsibzx v5, r6, r3
; CHECK-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-NEXT: vspltb v2, v2, 7
; CHECK-NEXT: vmrghb v2, v3, v2
; CHECK-NEXT: vspltb v4, v3, 7
; CHECK-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; CHECK-NEXT: vspltb v2, v2, 7
; CHECK-NEXT: addi r3, r3, .LCPI4_0@toc@l
; CHECK-NEXT: vmrghb v2, v3, v2
; CHECK-NEXT: vspltb v5, v5, 7
; CHECK-NEXT: vmrglh v2, v2, v4
; CHECK-NEXT: vmrghb v3, v3, v5
@ -466,9 +466,11 @@ define void @test8(i8* nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE: # %bb.0: # %entry
; P9BE-NEXT: add r6, r3, r4
; P9BE-NEXT: li r7, 8
; P9BE-NEXT: lxsibzx v2, r6, r7
; P9BE-NEXT: lxsibzx v4, r3, r4
; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; P9BE-NEXT: lxsibzx v2, r6, r7
; P9BE-NEXT: li r6, 0
; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l
; P9BE-NEXT: sldi r6, r6, 56
; P9BE-NEXT: vspltb v4, v4, 7
; P9BE-NEXT: mtvsrd v3, r6
@ -476,8 +478,6 @@ define void @test8(i8* nocapture readonly %sums, i32 signext %delta, i32 signext
; P9BE-NEXT: vmrghb v4, v3, v4
; P9BE-NEXT: vmrghb v2, v3, v2
; P9BE-NEXT: vspltb v3, v3, 0
; P9BE-NEXT: addis r3, r2, .LCPI4_0@toc@ha
; P9BE-NEXT: addi r3, r3, .LCPI4_0@toc@l
; P9BE-NEXT: vmrghh v4, v4, v3
; P9BE-NEXT: xxspltw v3, v3, 0
; P9BE-NEXT: vmrghw v2, v4, v2

View File

@ -804,8 +804,8 @@ define double @foo3_fmf(double %a) nounwind {
; CHECK-P9-LABEL: foo3_fmf:
; CHECK-P9: # %bb.0:
; CHECK-P9-NEXT: addis 3, 2, .LCPI20_2@toc@ha
; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3)
; CHECK-P9-NEXT: xsabsdp 0, 1
; CHECK-P9-NEXT: lfd 2, .LCPI20_2@toc@l(3)
; CHECK-P9-NEXT: xscmpudp 0, 0, 2
; CHECK-P9-NEXT: xxlxor 0, 0, 0
; CHECK-P9-NEXT: blt 0, .LBB20_2
@ -899,8 +899,8 @@ define float @goo3_fmf(float %a) nounwind {
; CHECK-P9-LABEL: goo3_fmf:
; CHECK-P9: # %bb.0:
; CHECK-P9-NEXT: addis 3, 2, .LCPI22_2@toc@ha
; CHECK-P9-NEXT: lfs 2, .LCPI22_2@toc@l(3)
; CHECK-P9-NEXT: xsabsdp 0, 1
; CHECK-P9-NEXT: lfs 2, .LCPI22_2@toc@l(3)
; CHECK-P9-NEXT: fcmpu 0, 0, 2
; CHECK-P9-NEXT: xxlxor 0, 0, 0
; CHECK-P9-NEXT: blt 0, .LBB22_2

View File

@ -28,69 +28,80 @@ define zeroext i32 @test1(i64 %0, i64* %1) {
; CHECK-NEXT: .cfi_offset r31, -8
; CHECK-NEXT: .cfi_offset r2, -152
; CHECK-NEXT: lis 5, 4
; CHECK-NEXT: std 30, 704(1) # 8-byte Folded Spill
; CHECK-NEXT: std 29, 696(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 6, 5, 6292
; CHECK-NEXT: std 28, 688(1) # 8-byte Folded Spill
; CHECK-NEXT: std 27, 680(1) # 8-byte Folded Spill
; CHECK-NEXT: std 26, 672(1) # 8-byte Folded Spill
; CHECK-NEXT: std 25, 664(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 5, 5, 6291
; CHECK-NEXT: std 14, 576(1) # 8-byte Folded Spill
; CHECK-NEXT: std 15, 584(1) # 8-byte Folded Spill
; CHECK-NEXT: std 16, 592(1) # 8-byte Folded Spill
; CHECK-NEXT: std 17, 600(1) # 8-byte Folded Spill
; CHECK-NEXT: std 18, 608(1) # 8-byte Folded Spill
; CHECK-NEXT: std 19, 616(1) # 8-byte Folded Spill
; CHECK-NEXT: std 20, 624(1) # 8-byte Folded Spill
; CHECK-NEXT: std 21, 632(1) # 8-byte Folded Spill
; CHECK-NEXT: std 22, 640(1) # 8-byte Folded Spill
; CHECK-NEXT: std 23, 648(1) # 8-byte Folded Spill
; CHECK-NEXT: std 24, 656(1) # 8-byte Folded Spill
; CHECK-NEXT: std 31, 712(1) # 8-byte Folded Spill
; CHECK-NEXT: std 2, 568(1) # 8-byte Folded Spill
; CHECK-NEXT: sldi 6, 6, 32
; CHECK-NEXT: oris 7, 6, 13030
; CHECK-NEXT: oris 8, 6, 13066
; CHECK-NEXT: ori 7, 7, 3704
; CHECK-NEXT: oris 9, 6, 13054
; CHECK-NEXT: oris 10, 6, 13042
; CHECK-NEXT: oris 11, 6, 13078
; CHECK-NEXT: oris 12, 6, 13115
; CHECK-NEXT: oris 0, 6, 13103
; CHECK-NEXT: oris 30, 6, 13091
; CHECK-NEXT: oris 29, 6, 13127
; CHECK-NEXT: oris 28, 6, 13164
; CHECK-NEXT: oris 27, 6, 13152
; CHECK-NEXT: oris 26, 6, 13139
; CHECK-NEXT: oris 25, 6, 13176
; CHECK-NEXT: ori 7, 7, 3704
; CHECK-NEXT: ori 8, 8, 44408
; CHECK-NEXT: ori 9, 9, 30840
; CHECK-NEXT: add 7, 4, 7
; CHECK-NEXT: oris 10, 6, 13042
; CHECK-NEXT: ori 10, 10, 17272
; CHECK-NEXT: std 7, 384(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 8
; CHECK-NEXT: oris 11, 6, 13078
; CHECK-NEXT: ori 11, 11, 57976
; CHECK-NEXT: std 7, 376(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 9
; CHECK-NEXT: oris 12, 6, 13115
; CHECK-NEXT: ori 12, 12, 33144
; CHECK-NEXT: std 7, 368(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 10
; CHECK-NEXT: oris 0, 6, 13103
; CHECK-NEXT: ori 0, 0, 19576
; CHECK-NEXT: std 7, 360(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 11
; CHECK-NEXT: std 30, 704(1) # 8-byte Folded Spill
; CHECK-NEXT: oris 30, 6, 13091
; CHECK-NEXT: ori 30, 30, 6008
; CHECK-NEXT: std 7, 352(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 12
; CHECK-NEXT: std 29, 696(1) # 8-byte Folded Spill
; CHECK-NEXT: oris 29, 6, 13127
; CHECK-NEXT: ori 29, 29, 46712
; CHECK-NEXT: ori 28, 28, 21880
; CHECK-NEXT: ori 27, 27, 8312
; CHECK-NEXT: ori 26, 26, 60280
; CHECK-NEXT: ori 25, 25, 35448
; CHECK-NEXT: add 7, 4, 7
; CHECK-NEXT: sldi 5, 5, 32
; CHECK-NEXT: oris 5, 5, 29347
; CHECK-NEXT: ori 5, 5, 20088
; CHECK-NEXT: std 7, 384(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 8
; CHECK-NEXT: lis 8, 402
; CHECK-NEXT: std 7, 376(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 9
; CHECK-NEXT: lis 9, 451
; CHECK-NEXT: std 7, 368(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 10
; CHECK-NEXT: lis 10, 500
; CHECK-NEXT: std 7, 360(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 11
; CHECK-NEXT: lis 11, 549
; CHECK-NEXT: std 31, 712(1) # 8-byte Folded Spill
; CHECK-NEXT: std 2, 568(1) # 8-byte Folded Spill
; CHECK-NEXT: std 7, 352(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 12
; CHECK-NEXT: std 7, 344(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 0
; CHECK-NEXT: std 28, 688(1) # 8-byte Folded Spill
; CHECK-NEXT: oris 28, 6, 13164
; CHECK-NEXT: ori 28, 28, 21880
; CHECK-NEXT: std 7, 336(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 30
; CHECK-NEXT: std 27, 680(1) # 8-byte Folded Spill
; CHECK-NEXT: oris 27, 6, 13152
; CHECK-NEXT: ori 27, 27, 8312
; CHECK-NEXT: std 7, 328(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 29
; CHECK-NEXT: std 26, 672(1) # 8-byte Folded Spill
; CHECK-NEXT: oris 26, 6, 13139
; CHECK-NEXT: ori 26, 26, 60280
; CHECK-NEXT: std 7, 320(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 28
; CHECK-NEXT: std 25, 664(1) # 8-byte Folded Spill
; CHECK-NEXT: oris 25, 6, 13176
; CHECK-NEXT: ori 25, 25, 35448
; CHECK-NEXT: std 7, 312(1) # 8-byte Folded Spill
; CHECK-NEXT: add 7, 4, 27
; CHECK-NEXT: std 7, 304(1) # 8-byte Folded Spill
@ -112,6 +123,10 @@ define zeroext i32 @test1(i64 %0, i64* %1) {
; CHECK-NEXT: lis 5, 268
; CHECK-NEXT: std 4, 256(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 4, 585
; CHECK-NEXT: std 6, 264(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 6, 305
; CHECK-NEXT: std 7, 272(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 7, 354
; CHECK-NEXT: ori 4, 4, 61440
; CHECK-NEXT: std 4, 560(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 4, 48
@ -200,94 +215,79 @@ define zeroext i32 @test1(i64 %0, i64* %1) {
; CHECK-NEXT: std 4, 192(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 36352
; CHECK-NEXT: lis 5, 317
; CHECK-NEXT: ld 30, 192(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 184(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 25088
; CHECK-NEXT: lis 5, 366
; CHECK-NEXT: ld 29, 184(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 176(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 13824
; CHECK-NEXT: lis 5, 415
; CHECK-NEXT: ld 28, 176(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 168(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 2560
; CHECK-NEXT: lis 5, 463
; CHECK-NEXT: ld 27, 168(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 160(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 56832
; CHECK-NEXT: lis 5, 512
; CHECK-NEXT: ld 26, 160(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 152(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 45568
; CHECK-NEXT: lis 5, 561
; CHECK-NEXT: ld 25, 152(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 144(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 34304
; CHECK-NEXT: lis 5, 12
; CHECK-NEXT: ld 24, 144(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 136(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 13568
; CHECK-NEXT: lis 5, 61
; CHECK-NEXT: ld 23, 136(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 128(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 2304
; CHECK-NEXT: lis 5, 109
; CHECK-NEXT: std 4, 120(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 56576
; CHECK-NEXT: lis 5, 158
; CHECK-NEXT: ld 0, 120(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 112(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 45312
; CHECK-NEXT: lis 5, 207
; CHECK-NEXT: ld 22, 112(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 104(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 34048
; CHECK-NEXT: lis 5, 256
; CHECK-NEXT: std 6, 264(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 6, 305
; CHECK-NEXT: ld 30, 192(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 29, 184(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 28, 176(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 27, 168(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 26, 160(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 25, 152(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 0, 120(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 21, 104(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 96(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 5, 22784
; CHECK-NEXT: std 7, 272(1) # 8-byte Folded Spill
; CHECK-NEXT: lis 7, 354
; CHECK-NEXT: ld 5, 248(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 20, 96(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 88(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 6, 11520
; CHECK-NEXT: ld 6, 240(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 19, 88(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 80(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 7, 256
; CHECK-NEXT: ld 7, 232(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 18, 80(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 72(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 8, 54528
; CHECK-NEXT: ld 8, 224(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 17, 72(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 64(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 9, 43264
; CHECK-NEXT: ld 9, 216(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 16, 64(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 56(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 10, 32000
; CHECK-NEXT: ld 10, 208(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 15, 56(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 48(1) # 8-byte Folded Spill
; CHECK-NEXT: ori 4, 11, 20736
; CHECK-NEXT: ld 11, 200(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 40(1) # 8-byte Folded Spill
; CHECK-NEXT: std 14, 576(1) # 8-byte Folded Spill
; CHECK-NEXT: std 15, 584(1) # 8-byte Folded Spill
; CHECK-NEXT: std 16, 592(1) # 8-byte Folded Spill
; CHECK-NEXT: std 17, 600(1) # 8-byte Folded Spill
; CHECK-NEXT: std 18, 608(1) # 8-byte Folded Spill
; CHECK-NEXT: std 19, 616(1) # 8-byte Folded Spill
; CHECK-NEXT: std 20, 624(1) # 8-byte Folded Spill
; CHECK-NEXT: std 21, 632(1) # 8-byte Folded Spill
; CHECK-NEXT: std 22, 640(1) # 8-byte Folded Spill
; CHECK-NEXT: std 23, 648(1) # 8-byte Folded Spill
; CHECK-NEXT: std 24, 656(1) # 8-byte Folded Spill
; CHECK-NEXT: ld 5, 248(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 24, 144(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 23, 136(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 22, 112(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 21, 104(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 20, 96(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 19, 88(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 18, 80(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 17, 72(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 16, 64(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 15, 56(1) # 8-byte Folded Reload
; CHECK-NEXT: ld 14, 48(1) # 8-byte Folded Reload
; CHECK-NEXT: std 4, 40(1) # 8-byte Folded Spill
; CHECK-NEXT: li 4, 0
; CHECK-NEXT: ld 31, 40(1) # 8-byte Folded Reload
; CHECK-NEXT: .p2align 4
@ -305,6 +305,32 @@ define zeroext i32 @test1(i64 %0, i64* %1) {
; CHECK-NEXT: stdux 3, 12, 2
; CHECK-NEXT: ld 2, 552(1) # 8-byte Folded Reload
; CHECK-NEXT: stdx 3, 12, 5
; CHECK-NEXT: stdx 3, 12, 6
; CHECK-NEXT: stdx 3, 12, 7
; CHECK-NEXT: stdx 3, 12, 8
; CHECK-NEXT: stdx 3, 12, 9
; CHECK-NEXT: stdx 3, 12, 10
; CHECK-NEXT: stdx 3, 12, 11
; CHECK-NEXT: stdx 3, 12, 30
; CHECK-NEXT: stdx 3, 12, 29
; CHECK-NEXT: stdx 3, 12, 28
; CHECK-NEXT: stdx 3, 12, 27
; CHECK-NEXT: stdx 3, 12, 26
; CHECK-NEXT: stdx 3, 12, 25
; CHECK-NEXT: stdx 3, 12, 24
; CHECK-NEXT: stdx 3, 12, 23
; CHECK-NEXT: stdx 3, 12, 4
; CHECK-NEXT: stdx 3, 12, 0
; CHECK-NEXT: stdx 3, 12, 22
; CHECK-NEXT: stdx 3, 12, 21
; CHECK-NEXT: stdx 3, 12, 20
; CHECK-NEXT: stdx 3, 12, 19
; CHECK-NEXT: stdx 3, 12, 18
; CHECK-NEXT: stdx 3, 12, 17
; CHECK-NEXT: stdx 3, 12, 16
; CHECK-NEXT: stdx 3, 12, 15
; CHECK-NEXT: stdx 3, 12, 14
; CHECK-NEXT: stdx 3, 12, 31
; CHECK-NEXT: stdx 3, 12, 2
; CHECK-NEXT: ld 2, 544(1) # 8-byte Folded Reload
; CHECK-NEXT: stdx 3, 12, 2
@ -344,35 +370,11 @@ define zeroext i32 @test1(i64 %0, i64* %1) {
; CHECK-NEXT: stdx 3, 12, 2
; CHECK-NEXT: ld 2, 400(1) # 8-byte Folded Reload
; CHECK-NEXT: stdx 3, 12, 2
; CHECK-NEXT: stdx 3, 12, 6
; CHECK-NEXT: stdx 3, 12, 7
; CHECK-NEXT: stdx 3, 12, 8
; CHECK-NEXT: stdx 3, 12, 9
; CHECK-NEXT: stdx 3, 12, 10
; CHECK-NEXT: stdx 3, 12, 11
; CHECK-NEXT: stdx 3, 12, 30
; CHECK-NEXT: stdx 3, 12, 29
; CHECK-NEXT: stdx 3, 12, 28
; CHECK-NEXT: stdx 3, 12, 27
; CHECK-NEXT: stdx 3, 12, 26
; CHECK-NEXT: stdx 3, 12, 25
; CHECK-NEXT: stdx 3, 12, 24
; CHECK-NEXT: stdx 3, 12, 23
; CHECK-NEXT: stdx 3, 12, 4
; CHECK-NEXT: stdx 3, 12, 0
; CHECK-NEXT: stdx 3, 12, 22
; CHECK-NEXT: stdx 3, 12, 21
; CHECK-NEXT: stdx 3, 12, 20
; CHECK-NEXT: stdx 3, 12, 19
; CHECK-NEXT: stdx 3, 12, 18
; CHECK-NEXT: stdx 3, 12, 17
; CHECK-NEXT: stdx 3, 12, 16
; CHECK-NEXT: stdx 3, 12, 15
; CHECK-NEXT: stdx 3, 12, 14
; CHECK-NEXT: stdx 3, 12, 31
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: ld 12, 384(1) # 8-byte Folded Reload
; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 4, 4, 1
; CHECK-NEXT: std 3, 0(12)
; CHECK-NEXT: ld 12, 376(1) # 8-byte Folded Reload
; CHECK-NEXT: std 3, 0(12)
@ -399,8 +401,6 @@ define zeroext i32 @test1(i64 %0, i64* %1) {
; CHECK-NEXT: ld 12, 288(1) # 8-byte Folded Reload
; CHECK-NEXT: std 3, 0(12)
; CHECK-NEXT: ld 12, 280(1) # 8-byte Folded Reload
; CHECK-NEXT: lwz 4, 396(1) # 4-byte Folded Reload
; CHECK-NEXT: addi 4, 4, 1
; CHECK-NEXT: std 3, 0(12)
; CHECK-NEXT: ld 12, 272(1) # 8-byte Folded Reload
; CHECK-NEXT: std 3, 0(12)

View File

@ -40,8 +40,8 @@ define void @redundancy_on_ppc_and_other_targets() nounwind {
; PPC64LE-NEXT: std 0, 16(1)
; PPC64LE-NEXT: stdu 1, -32(1)
; PPC64LE-NEXT: addis 3, 2, .LC0@toc@ha
; PPC64LE-NEXT: ld 3, .LC0@toc@l(3)
; PPC64LE-NEXT: li 4, 0
; PPC64LE-NEXT: ld 3, .LC0@toc@l(3)
; PPC64LE-NEXT: std 4, 0(3)
; PPC64LE-NEXT: bl barney.94
; PPC64LE-NEXT: nop

View File

@ -166,8 +166,8 @@ define <2 x float> @s2v_test_f2(float* nocapture readonly %f64, <2 x float> %vec
; P9LE-LABEL: s2v_test_f2:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: addi r3, r3, 4
; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrglw v2, v2, v2
; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrghw v2, v2, v3
; P9LE-NEXT: blr
@ -208,17 +208,17 @@ define <2 x float> @s2v_test_f3(float* nocapture readonly %f64, <2 x float> %vec
; P9LE-LABEL: s2v_test_f3:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: sldi r4, r7, 2
; P9LE-NEXT: lxsiwzx v3, r3, r4
; P9LE-NEXT: vmrglw v2, v2, v2
; P9LE-NEXT: lxsiwzx v3, r3, r4
; P9LE-NEXT: vmrghw v2, v2, v3
; P9LE-NEXT: blr
; P9BE-LABEL: s2v_test_f3:
; P9BE: # %bb.0: # %entry
; P9BE: sldi r4, r7, 2
; P9BE: lfiwzx f0, r3, r4
; P9BE-DAG: lfiwzx f0, r3, r4
; P9BE-DAG: xxspltw v2, v2, 1
; P9BE-DAG: xxsldwi v3, f0, f0, 1
; P9BE: xxsldwi v3, f0, f0, 1
; P9BE: vmrghw v2, v3, v2
; P9BE-NEXT: blr
@ -251,17 +251,17 @@ define <2 x float> @s2v_test_f4(float* nocapture readonly %f64, <2 x float> %vec
; P9LE-LABEL: s2v_test_f4:
; P9LE: # %bb.0: # %entry
; P9LE-NEXT: addi r3, r3, 4
; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrglw v2, v2, v2
; P9LE-NEXT: lxsiwzx v3, 0, r3
; P9LE-NEXT: vmrghw v2, v2, v3
; P9LE-NEXT: blr
; P9BE-LABEL: s2v_test_f4:
; P9BE: # %bb.0: # %entry
; P9BE: addi r3, r3, 4
; P9BE: lfiwzx f0, 0, r3
; P9BE-DAG: lfiwzx f0, 0, r3
; P9BE-DAG: xxspltw v2, v2, 1
; P9BE-DAG: xxsldwi v3, f0, f0, 1
; P9BE: xxsldwi v3, f0, f0, 1
; P9BE: vmrghw v2, v3, v2
; P9BE-NEXT: blr

View File

@ -18,9 +18,9 @@ define void @test([0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_a]* %
; CHECK-P9-NEXT: addi 6, 6, 16
; CHECK-P9-NEXT: rldicr 5, 5, 0, 58
; CHECK-P9-NEXT: addi 5, 5, -32
; CHECK-P9-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NEXT: rldicl 5, 5, 59, 5
; CHECK-P9-NEXT: addi 5, 5, 1
; CHECK-P9-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NEXT: mtctr 5
; CHECK-P9-NEXT: .p2align 4
; CHECK-P9-NEXT: .LBB0_1: # %vector.body
@ -36,13 +36,13 @@ define void @test([0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_a]* %
; CHECK-P9-NEXT: xvmuldp 4, 4, 0
; CHECK-P9-NEXT: xvmuldp 3, 3, 0
; CHECK-P9-NEXT: xvmuldp 5, 5, 0
; CHECK-P9-NEXT: addi 4, 4, 256
; CHECK-P9-NEXT: xvmuldp 6, 6, 0
; CHECK-P9-NEXT: stxv 1, 16(3)
; CHECK-P9-NEXT: stxv 2, 0(3)
; CHECK-P9-NEXT: stxv 3, 48(3)
; CHECK-P9-NEXT: stxv 4, 32(3)
; CHECK-P9-NEXT: stxv 5, 240(3)
; CHECK-P9-NEXT: addi 4, 4, 256
; CHECK-P9-NEXT: xvmuldp 6, 6, 0
; CHECK-P9-NEXT: stxv 2, 0(3)
; CHECK-P9-NEXT: stxv 6, 224(3)
; CHECK-P9-NEXT: addi 3, 3, 256
; CHECK-P9-NEXT: bdnz .LBB0_1
@ -57,9 +57,9 @@ define void @test([0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_a]* %
; CHECK-P9-NO-HEURISTIC-NEXT: rldicr 5, 5, 0, 58
; CHECK-P9-NO-HEURISTIC-NEXT: addi 6, 6, 16
; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, -32
; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NO-HEURISTIC-NEXT: rldicl 5, 5, 59, 5
; CHECK-P9-NO-HEURISTIC-NEXT: addi 5, 5, 1
; CHECK-P9-NO-HEURISTIC-NEXT: lxvdsx 0, 0, 6
; CHECK-P9-NO-HEURISTIC-NEXT: mtctr 5
; CHECK-P9-NO-HEURISTIC-NEXT: .p2align 4
; CHECK-P9-NO-HEURISTIC-NEXT: .LBB0_1: # %vector.body
@ -76,13 +76,13 @@ define void @test([0 x %_elem_type_of_x]* noalias %.x, [0 x %_elem_type_of_a]* %
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 3, 3, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 6, 6, 0
; CHECK-P9-NO-HEURISTIC-NEXT: xvmuldp 5, 5, 0
; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 1, 16(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 2, 0(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 3, 48(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 4, 32(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 5, 240(3)
; CHECK-P9-NO-HEURISTIC-NEXT: stxv 6, 224(3)
; CHECK-P9-NO-HEURISTIC-NEXT: addi 4, 4, 256
; CHECK-P9-NO-HEURISTIC-NEXT: addi 3, 3, 256
; CHECK-P9-NO-HEURISTIC-NEXT: bdnz .LBB0_1
; CHECK-P9-NO-HEURISTIC-NEXT: # %bb.2: # %return.block

View File

@ -22,10 +22,10 @@ define void @print_res() nounwind {
; CHECK-NEXT: isellt 3, 3, 4
; CHECK-NEXT: li 4, 0
; CHECK-NEXT: addi 3, 3, 1
; CHECK-NEXT: mtctr 3
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: li 7, -1
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: mtctr 3
; CHECK-NEXT: li 3, 1
; CHECK-NEXT: lbz 5, 0(5)
; CHECK-NEXT: bdz .LBB0_6
; CHECK-NEXT: # %bb.1:
@ -62,23 +62,23 @@ define void @print_res() nounwind {
; CHECK-NEXT: add 4, 4, 6
; CHECK-NEXT: .LBB0_6:
; CHECK-NEXT: xori 5, 5, 84
; CHECK-NEXT: cntlzw 5, 5
; CHECK-NEXT: clrldi 3, 3, 32
; CHECK-NEXT: std 3, 104(1)
; CHECK-NEXT: addis 3, 2, .LC0@toc@ha
; CHECK-NEXT: ld 3, .LC0@toc@l(3)
; CHECK-NEXT: li 7, 0
; CHECK-NEXT: li 8, 3
; CHECK-NEXT: std 3, 104(1)
; CHECK-NEXT: cntlzw 5, 5
; CHECK-NEXT: addis 3, 2, .LC0@toc@ha
; CHECK-NEXT: li 10, 0
; CHECK-NEXT: ld 3, .LC0@toc@l(3)
; CHECK-NEXT: srwi 5, 5, 5
; CHECK-NEXT: add 4, 4, 5
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: std 5, 120(1)
; CHECK-NEXT: li 5, 3
; CHECK-NEXT: std 5, 96(1)
; CHECK-NEXT: clrldi 6, 4, 32
; CHECK-NEXT: li 4, 3
; CHECK-NEXT: std 5, 96(1)
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: li 10, 0
; CHECK-NEXT: bl printf
; CHECK-NEXT: nop
%1 = load i32, i32* undef, align 4

View File

@ -7,8 +7,8 @@ define void @lame_encode_buffer_interleaved() local_unnamed_addr {
; CHECK: # %bb.0:
; CHECK-NEXT: lha 3, 0(3)
; CHECK-NEXT: li 5, 1
; CHECK-NEXT: sldi 5, 5, 62
; CHECK-NEXT: lhz 4, 0(0)
; CHECK-NEXT: sldi 5, 5, 62
; CHECK-NEXT: mtctr 5
; CHECK-NEXT: srawi 3, 3, 1
; CHECK-NEXT: addze 3, 3

View File

@ -21,9 +21,9 @@ define void @phi3(i32*) nounwind {
; CHECK-NEXT: nop
; CHECK-NEXT: addi 7, 30, -4
; CHECK-NEXT: mtctr 3
; CHECK-NEXT: lwzu 8, 4(7)
; CHECK-NEXT: addi 4, 29, -8
; CHECK-NEXT: li 5, 0
; CHECK-NEXT: lwzu 8, 4(7)
; CHECK-NEXT: bdz .LBB0_5
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: extswsli 6, 5, 5

View File

@ -10,17 +10,17 @@ define dso_local i32* @foo() local_unnamed_addr {
; CHECK-LABEL: foo:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addis r5, r2, x@toc@ha
; CHECK-NEXT: addi r5, r5, x@toc@l
; CHECK-NEXT: addi r5, r5, -8
; CHECK-NEXT: addis r6, r2, y@toc@ha
; CHECK-NEXT: li r7, 340
; CHECK-NEXT: addi r5, r5, x@toc@l
; CHECK-NEXT: addi r5, r5, -8
; CHECK-NEXT: addi r3, r6, y@toc@l
; CHECK-NEXT: lwz r6, y@toc@l(r6)
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: addi r4, r3, -8
; CHECK-NEXT: lwzu r7, 12(r5)
; CHECK-NEXT: maddld r6, r7, r7, r6
; CHECK-NEXT: lwz r7, 4(r5)
; CHECK-NEXT: addi r4, r3, -8
; CHECK-NEXT: stwu r6, 12(r4)
; CHECK-NEXT: maddld r6, r7, r7, r6
; CHECK-NEXT: lwz r7, 8(r5)
@ -29,12 +29,12 @@ define dso_local i32* @foo() local_unnamed_addr {
; CHECK-NEXT: #
; CHECK-NEXT: maddld r7, r7, r7, r6
; CHECK-NEXT: lwzu r8, 12(r5)
; CHECK-NEXT: maddld r8, r8, r8, r7
; CHECK-NEXT: stw r6, 4(r4)
; CHECK-NEXT: lwz r6, 4(r5)
; CHECK-NEXT: maddld r6, r6, r6, r8
; CHECK-NEXT: maddld r8, r8, r8, r7
; CHECK-NEXT: stw r7, 8(r4)
; CHECK-NEXT: lwz r7, 8(r5)
; CHECK-NEXT: maddld r6, r6, r6, r8
; CHECK-NEXT: stwu r8, 12(r4)
; CHECK-NEXT: bdnz .LBB0_1
; CHECK-NEXT: # %bb.2:

View File

@ -12,8 +12,8 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9LE-LABEL: fold_srem_vec_1:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -21386
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
@ -26,9 +26,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: lis r4, 31710
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: ori r4, r4, 63421
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: ori r4, r4, 63421
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: sub r4, r4, r3
; P9LE-NEXT: srwi r5, r4, 31
@ -39,21 +39,21 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: lis r4, 21399
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: ori r4, r4, 33437
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 5
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 98
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: lis r4, -16728
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -16728
; P9LE-NEXT: ori r4, r4, 63249
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
@ -69,8 +69,8 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE-LABEL: fold_srem_vec_1:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 31710
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 63421
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r4, r3, r4
@ -82,11 +82,11 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -21386
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
@ -96,11 +96,12 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -16728
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 63249
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: ori r4, r4, 63249
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 8
@ -109,12 +110,11 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 21399
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 5
@ -247,8 +247,8 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9LE-LABEL: fold_srem_vec_2:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -21386
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r5, r3, r4
@ -272,6 +272,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r5, r3, r4
; P9LE-NEXT: add r5, r5, r3
@ -280,7 +281,6 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: add r5, r5, r6
; P9LE-NEXT: mulli r5, r5, 95
; P9LE-NEXT: sub r3, r3, r5
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
@ -300,8 +300,8 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9BE-LABEL: fold_srem_vec_2:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, -21386
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r5, r3, r4
@ -327,6 +327,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r5, r3, r4
; P9BE-NEXT: add r5, r5, r3
@ -336,7 +337,6 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9BE-NEXT: mulli r5, r5, 95
; P9BE-NEXT: sub r3, r3, r5
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
@ -468,8 +468,8 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-LABEL: combine_srem_sdiv:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -21386
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r5, r3, r4
@ -493,6 +493,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r7, r3
; P9LE-NEXT: mulhw r8, r7, r4
; P9LE-NEXT: add r7, r8, r7
@ -501,7 +502,6 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-NEXT: add r7, r7, r8
; P9LE-NEXT: mulli r8, r7, 95
; P9LE-NEXT: sub r3, r3, r8
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
@ -512,6 +512,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-NEXT: srawi r4, r4, 6
; P9LE-NEXT: add r4, r4, r8
; P9LE-NEXT: mulli r8, r4, 95
; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: sub r3, r3, r8
; P9LE-NEXT: mtvsrd v2, r3
; P9LE-NEXT: vmrghh v2, v2, v4
@ -520,7 +521,6 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v3, r5
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r7
; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: vmrghh v4, v5, v4
; P9LE-NEXT: vmrglw v3, v4, v3
; P9LE-NEXT: vadduhm v2, v2, v3
@ -529,8 +529,8 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9BE-LABEL: combine_srem_sdiv:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r5, -21386
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r5, r5, 37253
; P9BE-NEXT: extsh r4, r3
; P9BE-NEXT: mulhw r6, r4, r5
@ -556,6 +556,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r7, r3
; P9BE-NEXT: mulhw r8, r7, r5
; P9BE-NEXT: add r7, r8, r7
@ -565,7 +566,6 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9BE-NEXT: mulli r8, r7, 95
; P9BE-NEXT: sub r3, r3, r8
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
@ -747,9 +747,10 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P9LE-NEXT: lis r4, -21386
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: ori r4, r4, 37253
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: add r4, r4, r3
; P9LE-NEXT: srwi r5, r4, 31
@ -757,7 +758,6 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 95
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
@ -791,11 +791,12 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -21386
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: ori r4, r4, 37253
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
@ -804,7 +805,6 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P9BE-NEXT: mulli r4, r4, 95
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
@ -914,8 +914,8 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9LE-LABEL: dont_fold_srem_one:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -14230
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 30865
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
@ -928,11 +928,12 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9LE-NEXT: lis r4, -19946
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v3, v4
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: add r4, r4, r3
; P9LE-NEXT: srwi r5, r4, 31
@ -940,12 +941,11 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 23
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: vmrghh v3, v3, v4
; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
@ -961,8 +961,8 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9BE-LABEL: dont_fold_srem_one:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, -19946
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r4, r3, r4
@ -974,11 +974,11 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 24749
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 11
@ -987,11 +987,12 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -14230
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 30865
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: ori r4, r4, 30865
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
@ -1003,7 +1004,6 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v2, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: vmrghh v2, v4, v2
; P9BE-NEXT: vmrghw v2, v2, v3
@ -1112,8 +1112,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9LE-LABEL: dont_fold_urem_i16_smax:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -19946
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: mulhw r4, r3, r4
@ -1126,9 +1126,9 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: mulhw r4, r3, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 11
@ -1138,6 +1138,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: extsh r3, r3
; P9LE-NEXT: srawi r4, r3, 15
; P9LE-NEXT: addze r4, r4
@ -1145,7 +1146,6 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9LE-NEXT: sub r3, r3, r4
; P9LE-NEXT: mtvsrd v2, r3
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: vmrghh v2, v2, v4
; P9LE-NEXT: vmrglw v2, v3, v2
@ -1154,8 +1154,8 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9BE-LABEL: dont_fold_urem_i16_smax:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, -19946
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: mulhw r4, r3, r4
@ -1167,11 +1167,11 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 24749
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 11
@ -1182,6 +1182,7 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: srawi r4, r3, 15
; P9BE-NEXT: addze r4, r4
@ -1191,7 +1192,6 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v2, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v3, v4
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: vmrghh v2, v4, v2
; P9BE-NEXT: vmrghw v2, v2, v3
@ -1290,10 +1290,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; P9LE-LABEL: dont_fold_srem_i64:
; P9LE: # %bb.0:
; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: mfvsrd r3, v3
; P9LE-NEXT: ori r4, r4, 47142
; P9LE-NEXT: sldi r4, r4, 32
; P9LE-NEXT: oris r4, r4, 58853
; P9LE-NEXT: mfvsrd r3, v3
; P9LE-NEXT: ori r4, r4, 6055
; P9LE-NEXT: mulhd r4, r3, r4
; P9LE-NEXT: rldicl r5, r4, 1, 63
@ -1316,10 +1316,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; P9LE-NEXT: sub r4, r4, r5
; P9LE-NEXT: mtvsrdd v3, r3, r4
; P9LE-NEXT: lis r4, 25653
; P9LE-NEXT: mfvsrd r3, v2
; P9LE-NEXT: ori r4, r4, 15432
; P9LE-NEXT: sldi r4, r4, 32
; P9LE-NEXT: oris r4, r4, 1603
; P9LE-NEXT: mfvsrd r3, v2
; P9LE-NEXT: ori r4, r4, 21445
; P9LE-NEXT: mulhd r4, r3, r4
; P9LE-NEXT: rldicl r5, r4, 1, 63
@ -1334,10 +1334,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; P9BE-LABEL: dont_fold_srem_i64:
; P9BE: # %bb.0:
; P9BE-NEXT: lis r4, 24749
; P9BE-NEXT: mfvsrld r3, v3
; P9BE-NEXT: ori r4, r4, 47142
; P9BE-NEXT: sldi r4, r4, 32
; P9BE-NEXT: oris r4, r4, 58853
; P9BE-NEXT: mfvsrld r3, v3
; P9BE-NEXT: ori r4, r4, 6055
; P9BE-NEXT: mulhd r4, r3, r4
; P9BE-NEXT: rldicl r5, r4, 1, 63
@ -1360,10 +1360,10 @@ define <4 x i64> @dont_fold_srem_i64(<4 x i64> %x) {
; P9BE-NEXT: sub r4, r4, r5
; P9BE-NEXT: mtvsrdd v3, r4, r3
; P9BE-NEXT: lis r4, 25653
; P9BE-NEXT: mfvsrld r3, v2
; P9BE-NEXT: ori r4, r4, 15432
; P9BE-NEXT: sldi r4, r4, 32
; P9BE-NEXT: oris r4, r4, 1603
; P9BE-NEXT: mfvsrld r3, v2
; P9BE-NEXT: ori r4, r4, 21445
; P9BE-NEXT: mulhd r4, r3, r4
; P9BE-NEXT: rldicl r5, r4, 1, 63

View File

@ -51,15 +51,15 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 "stack-probe-size"="32768" nounwin
; CHECK-P9-LE-NEXT: stdu r1, -48(r1)
; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30
; CHECK-P9-LE-NEXT: addi r3, r3, 15
; CHECK-P9-LE-NEXT: li r6, -32768
; CHECK-P9-LE-NEXT: mr r31, r1
; CHECK-P9-LE-NEXT: addi r4, r31, 48
; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4
; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29
; CHECK-P9-LE-NEXT: neg r5, r3
; CHECK-P9-LE-NEXT: li r6, -32768
; CHECK-P9-LE-NEXT: divd r7, r5, r6
; CHECK-P9-LE-NEXT: mulld r6, r7, r6
; CHECK-P9-LE-NEXT: mr r31, r1
; CHECK-P9-LE-NEXT: addi r4, r31, 48
; CHECK-P9-LE-NEXT: add r3, r1, r5
; CHECK-P9-LE-NEXT: mulld r6, r7, r6
; CHECK-P9-LE-NEXT: sub r5, r5, r6
; CHECK-P9-LE-NEXT: stdux r4, r1, r5
; CHECK-P9-LE-NEXT: cmpd r1, r3
@ -69,8 +69,8 @@ define i32 @foo(i32 %n) local_unnamed_addr #0 "stack-probe-size"="32768" nounwin
; CHECK-P9-LE-NEXT: cmpd r1, r3
; CHECK-P9-LE-NEXT: bne cr0, .LBB0_1
; CHECK-P9-LE-NEXT: .LBB0_2:
; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: li r4, 1
; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: stw r4, 4792(r3)
; CHECK-P9-LE-NEXT: lwz r3, 0(r3)
; CHECK-P9-LE-NEXT: ld r1, 0(r1)
@ -190,15 +190,15 @@ define i32 @bar(i32 %n) local_unnamed_addr #0 nounwind {
; CHECK-P9-LE-NEXT: stdu r1, -48(r1)
; CHECK-P9-LE-NEXT: rldic r4, r3, 2, 30
; CHECK-P9-LE-NEXT: addi r4, r4, 15
; CHECK-P9-LE-NEXT: li r7, -4096
; CHECK-P9-LE-NEXT: mr r31, r1
; CHECK-P9-LE-NEXT: addi r5, r31, 48
; CHECK-P9-LE-NEXT: rldicl r4, r4, 60, 4
; CHECK-P9-LE-NEXT: rldicl r4, r4, 4, 29
; CHECK-P9-LE-NEXT: neg r6, r4
; CHECK-P9-LE-NEXT: li r7, -4096
; CHECK-P9-LE-NEXT: divd r8, r6, r7
; CHECK-P9-LE-NEXT: mulld r7, r8, r7
; CHECK-P9-LE-NEXT: mr r31, r1
; CHECK-P9-LE-NEXT: addi r5, r31, 48
; CHECK-P9-LE-NEXT: add r4, r1, r6
; CHECK-P9-LE-NEXT: mulld r7, r8, r7
; CHECK-P9-LE-NEXT: sub r6, r6, r7
; CHECK-P9-LE-NEXT: stdux r5, r1, r6
; CHECK-P9-LE-NEXT: cmpd r1, r4
@ -208,10 +208,10 @@ define i32 @bar(i32 %n) local_unnamed_addr #0 nounwind {
; CHECK-P9-LE-NEXT: cmpd r1, r4
; CHECK-P9-LE-NEXT: bne cr0, .LBB1_1
; CHECK-P9-LE-NEXT: .LBB1_2:
; CHECK-P9-LE-NEXT: addi r4, r1, 32
; CHECK-P9-LE-NEXT: extswsli r3, r3, 2
; CHECK-P9-LE-NEXT: add r3, r4, r3
; CHECK-P9-LE-NEXT: li r5, 1
; CHECK-P9-LE-NEXT: addi r4, r1, 32
; CHECK-P9-LE-NEXT: add r3, r4, r3
; CHECK-P9-LE-NEXT: stw r5, 4096(r3)
; CHECK-P9-LE-NEXT: lwz r3, 0(r4)
; CHECK-P9-LE-NEXT: ld r1, 0(r1)
@ -334,16 +334,16 @@ define i32 @f(i32 %n) local_unnamed_addr #0 "stack-probe-size"="65536" nounwind
; CHECK-P9-LE-NEXT: stdu r1, -48(r1)
; CHECK-P9-LE-NEXT: rldic r3, r3, 2, 30
; CHECK-P9-LE-NEXT: addi r3, r3, 15
; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4
; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29
; CHECK-P9-LE-NEXT: lis r5, -1
; CHECK-P9-LE-NEXT: ori r5, r5, 0
; CHECK-P9-LE-NEXT: neg r6, r3
; CHECK-P9-LE-NEXT: divd r7, r6, r5
; CHECK-P9-LE-NEXT: mulld r7, r7, r5
; CHECK-P9-LE-NEXT: mr r31, r1
; CHECK-P9-LE-NEXT: addi r4, r31, 48
; CHECK-P9-LE-NEXT: rldicl r3, r3, 60, 4
; CHECK-P9-LE-NEXT: rldicl r3, r3, 4, 29
; CHECK-P9-LE-NEXT: neg r6, r3
; CHECK-P9-LE-NEXT: divd r7, r6, r5
; CHECK-P9-LE-NEXT: add r3, r1, r6
; CHECK-P9-LE-NEXT: mulld r7, r7, r5
; CHECK-P9-LE-NEXT: sub r6, r6, r7
; CHECK-P9-LE-NEXT: stdux r4, r1, r6
; CHECK-P9-LE-NEXT: cmpd r1, r3
@ -353,8 +353,8 @@ define i32 @f(i32 %n) local_unnamed_addr #0 "stack-probe-size"="65536" nounwind
; CHECK-P9-LE-NEXT: cmpd r1, r3
; CHECK-P9-LE-NEXT: bne cr0, .LBB2_1
; CHECK-P9-LE-NEXT: .LBB2_2:
; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: li r4, 1
; CHECK-P9-LE-NEXT: addi r3, r1, 32
; CHECK-P9-LE-NEXT: stw r4, 4792(r3)
; CHECK-P9-LE-NEXT: lwz r3, 0(r3)
; CHECK-P9-LE-NEXT: ld r1, 0(r1)

View File

@ -0,0 +1,18 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -run-pass=postmisched -o - %s | FileCheck %s
---
# Check that postmisched's TopDepthReduce heuristic moves the MULLD later
# because of the dependency on x5
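# The MULLD consumes the $x5 value produced by the first LD, so the scheduler is
# expected to place the independent load of $x4 between them to hide the load latency.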
name: test
body: |
bb.0:
; CHECK-LABEL: name: test
; CHECK: renamable $x5 = LD 0, killed renamable $x5 :: (load 8)
; CHECK: renamable $x4 = LD 0, killed renamable $x4 :: (load 8)
; CHECK: renamable $x5 = MULLD killed renamable $x5, renamable $x3
; CHECK: renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5
renamable $x5 = LD 0, killed renamable $x5 :: (load 8)
renamable $x5 = MULLD killed renamable $x5, renamable $x3
renamable $x4 = LD 0, killed renamable $x4 :: (load 8)
renamable $x3 = MADDLD8 killed renamable $x4, killed renamable $x3, killed renamable $x5
...

View File

@ -20,9 +20,9 @@ define dso_local <2 x double> @test1(<8 x i16> %a) {
; P9BE-NEXT: mtfprwz f0, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: xscvuxddp f0, f0
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mtfprwz f1, r3
; P9BE-NEXT: xscvuxddp f0, f0
; P9BE-NEXT: xscvuxddp f1, f1
; P9BE-NEXT: xxmrghd v2, vs0, vs1
; P9BE-NEXT: blr
@ -35,9 +35,9 @@ define dso_local <2 x double> @test1(<8 x i16> %a) {
; P9LE-NEXT: mtfprwz f0, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: xscvuxddp f0, f0
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mtfprwz f1, r3
; P9LE-NEXT: xscvuxddp f0, f0
; P9LE-NEXT: xscvuxddp f1, f1
; P9LE-NEXT: xxmrghd v2, vs1, vs0
; P9LE-NEXT: blr

View File

@ -6,8 +6,8 @@ define i8 @test_xaddr(i8* %p) {
; CHECK-LABEL: test_xaddr:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li r4, 0
; CHECK-NEXT: ori r4, r4, 40000
; CHECK-NEXT: std r3, -8(r1)
; CHECK-NEXT: ori r4, r4, 40000
; CHECK-NEXT: lbzx r3, r3, r4
; CHECK-NEXT: blr
entry:
@ -56,8 +56,8 @@ define void @test_xoaddr(i32* %arr, i32* %arrTo) {
; CHECK-LABEL: test_xoaddr:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi r3, r3, 8
; CHECK-NEXT: lxvx vs0, 0, r3
; CHECK-NEXT: addi r4, r4, 4
; CHECK-NEXT: lxvx vs0, 0, r3
; CHECK-NEXT: stxvx vs0, 0, r4
; CHECK-NEXT: blr
entry:
@ -77,9 +77,9 @@ define i64 @test_xaddrX4_loop(i8* %p) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: addi r4, r3, -8
; CHECK-NEXT: li r3, 8
; CHECK-NEXT: li r5, 3
; CHECK-NEXT: mtctr r3
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: li r5, 3
; The loop now has 4 instructions instead of 5, so its .p2align value changed from 5 to 4.
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_1: # %for.body

View File

@ -12,9 +12,11 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9LE-LABEL: fold_urem_vec_1:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 21399
; P9LE-NEXT: lis r5, 8456
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 33437
; P9LE-NEXT: ori r5, r5, 16913
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 5
@ -23,9 +25,9 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: lis r4, 16727
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: ori r4, r4, 2287
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: ori r4, r4, 2287
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 8
; P9LE-NEXT: mulli r4, r4, 1003
@ -33,8 +35,6 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r5, 8456
; P9LE-NEXT: ori r5, r5, 16913
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r4, r3, 16
; P9LE-NEXT: rlwinm r3, r3, 30, 18, 31
@ -45,9 +45,9 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: lis r4, 22765
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: sub r5, r3, r4
; P9LE-NEXT: srwi r5, r5, 1
@ -63,9 +63,11 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9BE-LABEL: fold_urem_vec_1:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 16727
; P9BE-NEXT: lis r5, 8456
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 2287
; P9BE-NEXT: ori r5, r5, 16913
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 8
@ -73,11 +75,11 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, 21399
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: ori r4, r4, 33437
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 5
; P9BE-NEXT: mulli r4, r4, 98
@ -86,8 +88,6 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r5, 8456
; P9BE-NEXT: ori r5, r5, 16913
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31
@ -97,11 +97,11 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r4, r3
; P9BE-NEXT: lis r4, 22765
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: sub r5, r3, r4
; P9BE-NEXT: srwi r5, r5, 1
@ -223,8 +223,8 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9LE-LABEL: fold_urem_vec_2:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 22765
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r5, r3, r4
@ -248,6 +248,7 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r5, r3, r4
; P9LE-NEXT: sub r6, r3, r5
@ -256,7 +257,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: srwi r5, r5, 6
; P9LE-NEXT: mulli r5, r5, 95
; P9LE-NEXT: sub r3, r3, r5
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
@ -276,8 +276,8 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9BE-LABEL: fold_urem_vec_2:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 22765
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r5, r3, r4
@ -303,6 +303,7 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r5, r3, r4
; P9BE-NEXT: sub r6, r3, r5
@ -312,7 +313,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9BE-NEXT: mulli r5, r5, 95
; P9BE-NEXT: sub r3, r3, r5
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
@ -444,8 +444,8 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE-LABEL: combine_urem_udiv:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 22765
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r5, r3, r4
@ -469,6 +469,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r7, r3, 16
; P9LE-NEXT: mulhwu r8, r7, r4
; P9LE-NEXT: sub r7, r7, r8
@ -477,7 +478,6 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE-NEXT: srwi r7, r7, 6
; P9LE-NEXT: mulli r8, r7, 95
; P9LE-NEXT: sub r3, r3, r8
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
@ -488,6 +488,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE-NEXT: add r4, r8, r4
; P9LE-NEXT: srwi r4, r4, 6
; P9LE-NEXT: mulli r8, r4, 95
; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: sub r3, r3, r8
; P9LE-NEXT: mtvsrd v2, r3
; P9LE-NEXT: vmrghh v2, v2, v4
@ -496,7 +497,6 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v3, r5
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: mtvsrd v4, r7
; P9LE-NEXT: mtvsrd v5, r4
; P9LE-NEXT: vmrghh v4, v5, v4
; P9LE-NEXT: vmrglw v3, v4, v3
; P9LE-NEXT: vadduhm v2, v2, v3
@ -505,8 +505,8 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9BE-LABEL: combine_urem_udiv:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r5, 22765
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r5, r5, 8969
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: mulhwu r6, r4, r5
@ -532,6 +532,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r7, r3, 16
; P9BE-NEXT: mulhwu r8, r7, r5
; P9BE-NEXT: sub r7, r7, r8
@ -541,7 +542,6 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9BE-NEXT: mulli r8, r7, 95
; P9BE-NEXT: sub r3, r3, r8
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
@ -708,7 +708,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P9LE-LABEL: dont_fold_urem_power_of_two:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: lis r4, 22765
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: clrlwi r3, r3, 26
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 2
@ -717,8 +719,6 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, 22765
; P9LE-NEXT: ori r4, r4, 8969
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r4, r3, r4
@ -740,7 +740,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P9BE-LABEL: dont_fold_urem_power_of_two:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: lis r4, 22765
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: clrlwi r3, r3, 27
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v3, r3
@ -751,8 +753,6 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 22765
; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r4, r3, r4
@ -844,9 +844,11 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9LE-LABEL: dont_fold_urem_one:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r4, -19946
; P9LE-NEXT: lis r5, -14230
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: ori r5, r5, 30865
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 4
@ -855,9 +857,9 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9LE-NEXT: lis r4, 24749
; P9LE-NEXT: mtvsrd v3, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r3, r3, 16
; P9LE-NEXT: ori r4, r4, 47143
; P9LE-NEXT: mulhwu r4, r3, r4
; P9LE-NEXT: srwi r4, r4, 11
; P9LE-NEXT: mulli r4, r4, 5423
@ -865,8 +867,6 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9LE-NEXT: mtvsrd v4, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r5, -14230
; P9LE-NEXT: ori r5, r5, 30865
; P9LE-NEXT: vmrghh v3, v4, v3
; P9LE-NEXT: clrlwi r4, r3, 16
; P9LE-NEXT: rlwinm r3, r3, 31, 17, 31
@ -884,9 +884,11 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9BE-LABEL: dont_fold_urem_one:
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r4, 24749
; P9BE-NEXT: lis r5, -14230
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: ori r5, r5, 30865
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 11
@ -894,11 +896,11 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9BE-NEXT: sub r3, r3, r4
; P9BE-NEXT: lis r4, -19946
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: srwi r4, r4, 4
; P9BE-NEXT: mulli r4, r4, 23
@ -907,8 +909,6 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: lis r5, -14230
; P9BE-NEXT: ori r5, r5, 30865
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31
@ -1023,10 +1023,10 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P9LE-LABEL: dont_fold_urem_i64:
; P9LE: # %bb.0:
; P9LE-NEXT: lis r4, 25644
; P9LE-NEXT: mfvsrld r3, v3
; P9LE-NEXT: ori r4, r4, 34192
; P9LE-NEXT: sldi r4, r4, 32
; P9LE-NEXT: oris r4, r4, 45590
; P9LE-NEXT: mfvsrld r3, v3
; P9LE-NEXT: ori r4, r4, 17097
; P9LE-NEXT: mulhdu r4, r3, r4
; P9LE-NEXT: sub r5, r3, r4
@ -1047,9 +1047,9 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P9LE-NEXT: sub r4, r4, r5
; P9LE-NEXT: lis r5, 25653
; P9LE-NEXT: ori r5, r5, 15432
; P9LE-NEXT: sldi r5, r5, 32
; P9LE-NEXT: mtvsrdd v3, r4, r3
; P9LE-NEXT: mfvsrd r3, v2
; P9LE-NEXT: sldi r5, r5, 32
; P9LE-NEXT: rldicl r4, r3, 63, 1
; P9LE-NEXT: oris r5, r5, 1603
; P9LE-NEXT: ori r5, r5, 21445
@ -1064,10 +1064,10 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P9BE-LABEL: dont_fold_urem_i64:
; P9BE: # %bb.0:
; P9BE-NEXT: lis r4, 25644
; P9BE-NEXT: mfvsrd r3, v3
; P9BE-NEXT: ori r4, r4, 34192
; P9BE-NEXT: sldi r4, r4, 32
; P9BE-NEXT: oris r4, r4, 45590
; P9BE-NEXT: mfvsrd r3, v3
; P9BE-NEXT: ori r4, r4, 17097
; P9BE-NEXT: mulhdu r4, r3, r4
; P9BE-NEXT: sub r5, r3, r4
@ -1075,8 +1075,8 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P9BE-NEXT: add r4, r5, r4
; P9BE-NEXT: lis r5, -16037
; P9BE-NEXT: rldicl r4, r4, 60, 4
; P9BE-NEXT: mulli r4, r4, 23
; P9BE-NEXT: ori r5, r5, 28749
; P9BE-NEXT: mulli r4, r4, 23
; P9BE-NEXT: sldi r5, r5, 32
; P9BE-NEXT: oris r5, r5, 52170
; P9BE-NEXT: ori r5, r5, 12109
@ -1088,9 +1088,9 @@ define <4 x i64> @dont_fold_urem_i64(<4 x i64> %x) {
; P9BE-NEXT: sub r4, r4, r5
; P9BE-NEXT: lis r5, 25653
; P9BE-NEXT: ori r5, r5, 15432
; P9BE-NEXT: sldi r5, r5, 32
; P9BE-NEXT: mtvsrdd v3, r3, r4
; P9BE-NEXT: mfvsrld r3, v2
; P9BE-NEXT: sldi r5, r5, 32
; P9BE-NEXT: rldicl r4, r3, 63, 1
; P9BE-NEXT: oris r5, r5, 1603
; P9BE-NEXT: ori r5, r5, 21445

View File

@ -138,8 +138,8 @@ define <8 x i16> @test_v8i16_sign_negative(<8 x i16> %m, <8 x i16> %n) {
; CHECK-P9-LABEL: test_v8i16_sign_negative:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addis 3, 2, .LCPI6_0@toc@ha
; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l
; CHECK-P9-NEXT: vadduhm 2, 2, 3
; CHECK-P9-NEXT: addi 3, 3, .LCPI6_0@toc@l
; CHECK-P9-NEXT: lxvx 35, 0, 3
; CHECK-P9-NEXT: vadduhm 2, 2, 3
; CHECK-P9-NEXT: vspltish 3, 1

View File

@ -3,7 +3,8 @@
define dso_local void @test(i32* %Arr, i32 signext %Len) {
; CHECK-LABEL: test:
; CHECK: lxvx [[REG:vs[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}}
; CHECK-NEXT: xxbrw vs{{[0-9]+}}, [[REG]]
; CHECK-NOT: [[REG]]
; CHECK: xxbrw vs{{[0-9]+}}, [[REG]]
entry:
%cmp1 = icmp slt i32 0, %Len
br i1 %cmp1, label %for.body.lr.ph, label %for.cond.cleanup

Some files were not shown because too many files have changed in this diff.