1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 12:43:36 +01:00
llvm-mirror/test/CodeGen/AMDGPU/load-global-i16.ll
Baptiste Saleil 8c45c9b8dd [VirtRegRewriter] Insert missing killed flags when tracking subregister liveness
VirtRegRewriter may sometimes fail to correctly apply the kill flag where necessary,
which causes unnecessary code gen on PowerPC. This patch fixes the way masks for
defined lanes are computed and the way mask for used lanes is computed.

Contact albion.fung@ibm.com instead of author for problems related to this commit.

Differential Revision: https://reviews.llvm.org/D92405
2021-03-03 12:02:04 -05:00

8754 lines
397 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-HSA %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s
; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_load_i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_short v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_load_i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_short v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: AND_INT * T1.W, T0.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T0.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T0.X, T1.W, PV.W,
; EG-NEXT: LSHL * T0.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T0.Y, 0.0,
; EG-NEXT: MOV * T0.Z, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_i16:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 11, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: AND_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T0.X, literal.x,
; CM-NEXT: LSHL * T0.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T0.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T0.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T0.Y, 0.0,
; CM-NEXT: MOV * T0.Z, 0.0,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load i16, i16 addrspace(1)* %in
store i16 %ld, i16 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_load_v2i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_dword v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_load_v2i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v2i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v2i16:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_load_v3i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_short v1, off, s[4:7], 0 offset:4
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 4
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_short v[4:5], v1
; GCN-HSA-NEXT: flat_store_dword v[2:3], v0
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_load_v3i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_short v1, off, s[0:3], 0 offset:4
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v3i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @6
; EG-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 0
; EG-NEXT: MEM_RAT MSKOR T5.XW, T8.X
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
; EG-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
; EG-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; EG-NEXT: AND_INT T1.W, PV.W, literal.x,
; EG-NEXT: AND_INT * T2.W, T5.X, literal.y,
; EG-NEXT: 3(4.203895e-45), 65535(9.183409e-41)
; EG-NEXT: LSHL * T1.W, PV.W, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: LSHL T5.X, T2.W, PV.W,
; EG-NEXT: LSHL * T5.W, literal.x, PV.W,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: MOV * T5.Z, 0.0,
; EG-NEXT: LSHR T8.X, T0.W, literal.x,
; EG-NEXT: LSHL T0.W, T7.X, literal.y,
; EG-NEXT: AND_INT * T1.W, T6.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: OR_INT T6.X, PV.W, PS,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v3i16:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 2 @6
; CM-NEXT: ALU 19, @13, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT MSKOR T5.XW, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6.X, T7.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T6.X, T5.X, 0, #1
; CM-NEXT: VTX_READ_16 T7.X, T5.X, 2, #1
; CM-NEXT: VTX_READ_16 T5.X, T5.X, 4, #1
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 4(5.605194e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT * T1.W, PV.W, literal.x,
; CM-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; CM-NEXT: AND_INT T0.Z, T5.X, literal.x,
; CM-NEXT: LSHL * T1.W, PV.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; CM-NEXT: LSHL T5.X, PV.Z, PV.W,
; CM-NEXT: LSHL * T5.W, literal.x, PV.W,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: MOV * T5.Z, 0.0,
; CM-NEXT: LSHL T0.Z, T7.X, literal.x,
; CM-NEXT: AND_INT * T1.W, T6.X, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 65535(9.183409e-41)
; CM-NEXT: OR_INT * T6.X, PV.Z, PV.W,
; CM-NEXT: LSHR * T7.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T8.X, T0.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_load_v4i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_load_v4i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v4i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v4i16:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_load_v8i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_load_v8i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v8i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v8i16:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_load_v16i16:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 1, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T2.X, 0
; EG-NEXT: TEX 0 @10
; EG-NEXT: ALU 3, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
; EG-NEXT: Fetch clause starting at 10:
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 15:
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T1.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v16i16:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @8
; CM-NEXT: ALU 1, @13, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: TEX 0 @10
; CM-NEXT: ALU 3, @15, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 0, #1
; CM-NEXT: Fetch clause starting at 10:
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 16, #1
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 15:
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_load_v16i16_align2(<16 x i16> addrspace(1)* %in, <16 x i16> addrspace(1)* %out) #0 {
; GCN-NOHSA-SI-LABEL: global_load_v16i16_align2:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v1, off, s[4:7], 0 offset:2
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v4, off, s[4:7], 0 offset:4
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v2, off, s[4:7], 0 offset:6
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v5, off, s[4:7], 0 offset:8
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v3, off, s[4:7], 0 offset:10
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v6, off, s[4:7], 0 offset:12
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v7, off, s[4:7], 0 offset:14
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v8, off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v9, off, s[4:7], 0 offset:18
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v10, off, s[4:7], 0 offset:20
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v11, off, s[4:7], 0 offset:22
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v12, off, s[4:7], 0 offset:24
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v13, off, s[4:7], 0 offset:26
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v14, off, s[4:7], 0 offset:28
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v15, off, s[4:7], 0 offset:30
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(8)
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NOHSA-SI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v3, v7, v6
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v2, v16, v5
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v1, v17, v4
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v0, v18, v0
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v7, v15, v14
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v6, v13, v12
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v5, v11, v10
; GCN-NOHSA-SI-NEXT: v_or_b32_e32 v4, v9, v8
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_add_u32 s0, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s0, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s1, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_load_v16i16_align2:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:2
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:4
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:6
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v5, off, s[0:3], 0 offset:8
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v3, off, s[0:3], 0 offset:10
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v6, off, s[0:3], 0 offset:12
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v7, off, s[0:3], 0 offset:14
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v8, off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v9, off, s[0:3], 0 offset:18
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v10, off, s[0:3], 0 offset:20
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v11, off, s[0:3], 0 offset:22
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v12, off, s[0:3], 0 offset:24
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v13, off, s[0:3], 0 offset:26
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v14, off, s[0:3], 0 offset:28
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v15, off, s[0:3], 0 offset:30
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s7
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(14)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v18, 16, v1
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v0, v18, v0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(12)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v1, v17, v4
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(10)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v16, 16, v3
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v2, v16, v5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(8)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v3, v7, v6
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v9, 16, v9
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v4, v9, v8
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v5, v11, v10
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v13, 16, v13
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v6, v13, v12
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshlrev_b32_e32 v15, 16, v15
; GCN-NOHSA-VI-NEXT: v_or_b32_e32 v7, v15, v14
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_load_v16i16_align2:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @6
; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T2.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T0.X, KC0[2].Y,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHR T2.X, KC0[2].Z, literal.x,
; EG-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_load_v16i16_align2:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 1 @6
; CM-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T3.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1, T2.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1
; CM-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T0.X, KC0[2].Y,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: ADD_INT * T2.W, KC0[2].Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T3.X, KC0[2].Z, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <16 x i16>, <16 x i16> addrspace(1)* %in, align 2
store <16 x i16> %ld, <16 x i16> addrspace(1)* %out, align 32
ret void
}
; Scalar zero-extending load. Reads one i16 from %in, widens it to i32 with
; zext, and stores the result to %out. Every run line expects the extension to
; be folded into an unsigned 16-bit load (buffer_load_ushort, flat_load_ushort
; or VTX_READ_16) with no separate masking instruction before the store.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_i16_to_i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
ret void
}
; Scalar sign-extending load. Reads one i16 from %in, widens it to i32 with
; sext, and stores the result to %out. The three GCN run lines expect a signed
; 16-bit load (buffer_load_sshort or flat_load_sshort) and nothing else before
; the store. The two r600 run lines expect VTX_READ_16 followed by a BFE_INT
; whose literal operand is 16 to perform the sign extension.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
; CM-LABEL: global_sextload_i16_to_i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i32
store i32 %ext, i32 addrspace(1)* %out
ret void
}
; Single-element vector zero-extending load. Reads a <1 x i16> from %in,
; widens it to <1 x i32> with zext, and stores the result to %out. The expected
; output treats the one-element vector like a scalar, using one unsigned 16-bit
; load (buffer_load_ushort, flat_load_ushort or VTX_READ_16) followed directly
; by a 32-bit store.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v1i16_to_v1i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 1, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
ret void
}
; Single-element vector sign-extending load. Reads a <1 x i16> from %in,
; widens it to <1 x i32> with sext, and stores the result to %out. The three
; GCN run lines expect one signed 16-bit load (buffer_load_sshort or
; flat_load_sshort). The two r600 run lines expect VTX_READ_16 followed by a
; BFE_INT whose literal operand is 16.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dword v[0:1], v2
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_sshort v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
;
; CM-LABEL: global_sextload_v1i16_to_v1i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0.X, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i32>
store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
ret void
}
; Two-element vector zero-extending load. Reads a <2 x i16> from %in, widens
; it to <2 x i32> with zext, and stores the result to %out. Every run line
; expects both halves to be fetched with one 32-bit load (buffer_load_dword,
; flat_load_dword or VTX_READ_32). The high element is then produced by a
; logical shift right by 16 and the low element by an AND with 0xffff, and the
; pair is written with a single two-dword store.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XY, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T4.Y, T4.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; CM-LABEL: global_zextload_v2i16_to_v2i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T4.Y, T4.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T4.X, T4.X, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = zext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
ret void
}
; TODO: On EG and CM this should use ASHR instead of LSHR + BFE_INT for the high element
; Two-element vector sign-extending load. Reads a <2 x i16> from %in, widens
; it to <2 x i32> with sext, and stores the result to %out. Every run line
; expects one 32-bit load. The three GCN run lines then expect an arithmetic
; shift right by 16 (v_ashrrev_i32) for the high element and a v_bfe_i32 with
; offset 0 and width 16 for the low element. The two r600 run lines expect
; BFE_INT for the low element, and a logical LSHR by 16 followed by a second
; BFE_INT for the high element.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v2, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XY, T4.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
; EG-NEXT: LSHR T0.W, T4.X, literal.x,
; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v2i16_to_v2i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 5, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T4.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT T5.X, T4.X, 0.0, literal.x,
; CM-NEXT: LSHR * T0.W, T4.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T5.Y, PV.W, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i32>
store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
ret void
}
; Three-element vector zero-extending load. Reads a <3 x i16> from %in,
; widens it to <3 x i32> with zext, and stores the result to %out. The three
; GCN run lines expect one 64-bit load (dwordx2) and then a shift by 16 plus
; two AND operations with 0xffff held in an SGPR. The SI run line expects the
; result to be written with two stores (a dword at offset 8 and a dwordx2),
; whereas the HSA and VI run lines expect a single dwordx3 store. The two r600
; run lines expect three separate VTX_READ_16 fetches and two stores.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v1
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v3
; GCN-HSA-NEXT: v_and_b32_e32 v2, s2, v4
; GCN-HSA-NEXT: v_and_b32_e32 v0, s2, v3
; GCN-HSA-NEXT: flat_store_dwordx3 v[5:6], v[0:2]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v3i16_to_v3i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @6
; EG-NEXT: ALU 2, @17, KC0[], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T0.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T2.X, T1.X, 4, #1
; EG-NEXT: VTX_READ_16 T3.X, T1.X, 0, #1
; EG-NEXT: VTX_READ_16 T1.X, T1.X, 2, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
; EG-NEXT: MOV * T1.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 17:
; EG-NEXT: LSHR T4.X, T0.W, literal.x,
; EG-NEXT: MOV * T3.Y, T1.X,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v3i16_to_v3i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 2 @6
; CM-NEXT: ALU 2, @17, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T3, T4.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T2.X, T0.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T2.X, T1.X, 4, #1
; CM-NEXT: VTX_READ_16 T3.X, T1.X, 0, #1
; CM-NEXT: VTX_READ_16 T1.X, T1.X, 2, #1
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 8(1.121039e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T0.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: MOV * T1.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 17:
; CM-NEXT: LSHR T4.X, KC0[2].Y, literal.x,
; CM-NEXT: MOV * T3.Y, T1.X,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
%ext = zext <3 x i16> %ld to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
ret void
}
; Three-element vector sign-extending load. Reads a <3 x i16> from %in,
; widens it to <3 x i32> with sext, and stores the result to %out. The three
; GCN run lines expect one 64-bit load (dwordx2), an arithmetic shift right by
; 16 for element 1, and v_bfe_i32 with offset 0 and width 16 for elements 0
; and 2. The SI run line expects two stores (a dword at offset 8 and a
; dwordx2), whereas the HSA and VI run lines expect a single dwordx3 store.
; The two r600 run lines expect three VTX_READ_16 fetches, one BFE_INT per
; element, and two stores.
; The assertions below are autogenerated, so regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
; GCN-NOHSA-SI-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-NOHSA-SI: ; %bb.0: ; %entry
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v3, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx3 v[5:6], v[0:2]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-NOHSA-VI: ; %bb.0: ; %entry
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v3, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v3i16_to_v3i32:
; EG: ; %bb.0: ; %entry
; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @6
; EG-NEXT: ALU 9, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T1.X, T0.X, 2, #1
; EG-NEXT: VTX_READ_16 T2.X, T0.X, 4, #1
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: BFE_INT * T0.Y, T1.X, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: BFE_INT T2.X, T2.X, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; EG-NEXT: LSHR * T3.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v3i16_to_v3i32:
; CM: ; %bb.0: ; %entry
; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 2 @6
; CM-NEXT: ALU 9, @13, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T2.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T1.X, T3.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T1.X, T0.X, 4, #1
; CM-NEXT: VTX_READ_16 T2.X, T0.X, 0, #1
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 2, #1
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: BFE_INT T1.X, T1.X, 0.0, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 8(1.121039e-44)
; CM-NEXT: LSHR T3.X, PV.W, literal.x,
; CM-NEXT: BFE_INT * T0.Y, T0.X, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: BFE_INT * T0.X, T2.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T2.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
entry:
%ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
%ext = sext <3 x i16> %ld to <3 x i32>
store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
ret void
}
; TODO: This should use DST, but for some reason there are redundant MOVs
; Kernel under test: load a <4 x i16> vector from global memory (addrspace(1))
; at %in, zero-extend every element to i32, and store the <4 x i32> result to
; %out.
;
; The assertion lines inside this function were autogenerated (see the note at
; the top of this file); regenerate them with the update script instead of
; editing them by hand.
;
; What the assertions pin down:
;  - The three GCN prefixes expect one dwordx2 load, v_lshrrev_b32 by 16 to
;    extract the upper 16 bits of each loaded dword, v_and_b32 with a 0xffff
;    mask held in an SGPR for the lower 16 bits, and one dwordx4 store.
;  - The EG and CM prefixes expect one VTX_READ_64, LSHR by 16 plus AND_INT
;    with 65535, and a single MEM_RAT_CACHELESS store.
define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v5
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v4
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GCN-HSA-NEXT: s_mov_b32 s2, 0xffff
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_and_b32_e32 v2, s2, v5
; GCN-HSA-NEXT: v_and_b32_e32 v0, s2, v4
; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 12, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: LSHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T5.X, T0.Y, literal.x,
; EG-NEXT: LSHR * T6.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; CM-LABEL: global_zextload_v4i16_to_v4i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 13, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV T3.X, T5.Y,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, PV.X,
; CM-NEXT: LSHR * T5.W, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T5.Z, T0.Z, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T5.Y, T0.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T5.X, T0.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T6.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: one vector load, a zext to <4 x i32>, one vector store.
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
ret void
}
; TODO: We should use ASHR instead of LSHR + BFE
; TODO: This should use DST, but for some reason there are redundant MOVs
; Kernel under test: load a <4 x i16> vector from global memory (addrspace(1))
; at %in, sign-extend every element to i32, and store the <4 x i32> result to
; %out.
;
; The assertion lines inside this function were autogenerated (see the note at
; the top of this file); regenerate them with the update script instead of
; editing them by hand.
;
; What the assertions pin down:
;  - All three GCN prefixes expect one dwordx2 load, v_bfe_i32 (offset 0,
;    width 16) for the lower 16 bits of each dword, and one dwordx4 store.
;  - For the upper 16 bits, the SI and HSA prefixes expect v_ashrrev_i32 by 16
;    on the first dword and a 64-bit v_ashr_i64 by 48 (followed by a v_mov)
;    for the second dword, while the VI prefix expects v_ashrrev_i32 by 16 on
;    both dwords.
;  - The EG and CM prefixes expect BFE_INT with width 16, preceded by LSHR by
;    16 for the upper halves.
define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v3
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[5:6], v[3:4], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[3:4], v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashr_i64 v[7:8], v[3:4], 48
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v0, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v7
; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[4:5], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T5.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT * T5.Y, PS, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v4i16_to_v4i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 14, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV T3.X, T5.Y,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, PV.X,
; CM-NEXT: BFE_INT * T5.Z, PV.Z, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T5.X, T0.Y, 0.0, literal.x,
; CM-NEXT: LSHR * T0.W, T0.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T0.Z, T0.Y, literal.x,
; CM-NEXT: BFE_INT * T5.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T5.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; IR under test: one vector load, a sext to <4 x i32>, one vector store.
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i32>
store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
ret void
}
; TODO: These should use LSHR instead of BFE_UINT
; Kernel under test: load an <8 x i16> vector from global memory
; (addrspace(1)) at %in, zero-extend every element to i32, and store the
; <8 x i32> result to %out.
;
; The assertion lines inside this function were autogenerated (see the note at
; the top of this file); regenerate them with the update script instead of
; editing them by hand.
;
; What the assertions pin down:
;  - The three GCN prefixes expect one dwordx4 load, v_lshrrev_b32 by 16 and
;    v_and_b32 with a 0xffff mask on each loaded dword, and two dwordx4
;    stores. The buffer variants use "offset:16" for the second store; the HSA
;    variant computes that address with s_add_u32 / s_addc_u32.
;  - The EG and CM prefixes expect one VTX_READ_128, LSHR by 16 plus AND_INT
;    with 65535, and two MEM_RAT_CACHELESS stores.
define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s2, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s2, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s2, v3
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s2, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v3
; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v1
; GCN-HSA-NEXT: v_and_b32_e32 v4, s4, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s6, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s6, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i16_to_v8i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T8.W, T7.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T8.Z, T7.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR T8.Y, T7.X, literal.x,
; EG-NEXT: LSHR * T9.W, T7.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T8.X, T7.X, literal.x,
; EG-NEXT: AND_INT T9.Z, T7.W, literal.x,
; EG-NEXT: LSHR * T7.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
; EG-NEXT: LSHR * T9.Y, T7.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T9.X, T7.Z, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR * T10.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v8i16_to_v8i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 17, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T10.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T9.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T8.W, T7.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T8.Z, T7.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR T8.Y, T7.Z, literal.x,
; CM-NEXT: LSHR * T7.W, T7.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T8.X, T7.Z, literal.x,
; CM-NEXT: AND_INT T7.Z, T7.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR T9.X, PV.W, literal.x,
; CM-NEXT: LSHR * T7.Y, T7.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT * T7.X, T7.X, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T10.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: one vector load, a zext to <8 x i32>, one vector store.
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
ret void
}
; TODO: These should use ASHR instead of LSHR + BFE_INT
; Kernel under test: load an <8 x i16> vector from global memory
; (addrspace(1)) at %in, sign-extend every element to i32, and store the
; <8 x i32> result to %out.
;
; The assertion lines inside this function were autogenerated (see the note at
; the top of this file); regenerate them with the update script instead of
; editing them by hand.
;
; What the assertions pin down:
;  - The three GCN prefixes expect one dwordx4 load, v_ashrrev_i32 by 16 for
;    the upper 16 bits and v_bfe_i32 (offset 0, width 16) for the lower 16
;    bits of each loaded dword, and two dwordx4 stores. The buffer variants
;    use "offset:16" for the second store; the HSA variant computes that
;    address with s_add_u32 / s_addc_u32.
;  - The EG and CM prefixes expect one VTX_READ_128, BFE_INT with width 16
;    (preceded by LSHR by 16 for the upper halves), and two
;    MEM_RAT_CACHELESS stores.
define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v1
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v2, 0, 16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v1
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v8i16_to_v8i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T10.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT * T8.Z, T7.Y, 0.0, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T8.X, T7.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T9.Z, T7.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T7.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T9.X, T7.Z, 0.0, literal.x,
; EG-NEXT: LSHR T0.Z, T7.W, literal.x,
; EG-NEXT: BFE_INT T8.W, PV.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T7.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: BFE_INT T8.Y, PS, 0.0, literal.y,
; EG-NEXT: LSHR T1.Z, T7.Z, literal.y,
; EG-NEXT: BFE_INT T9.W, PV.Z, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T10.X, PS, literal.x,
; EG-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v8i16_to_v8i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 19, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T7.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T10.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T8.Z, T7.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T8.X, T7.Z, 0.0, literal.x,
; CM-NEXT: LSHR T0.Y, T7.Y, literal.x,
; CM-NEXT: BFE_INT T9.Z, T7.Y, 0.0, literal.x,
; CM-NEXT: LSHR * T0.W, T7.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T9.X, T7.X, 0.0, literal.x,
; CM-NEXT: LSHR T1.Y, T7.Z, literal.x,
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T8.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T10.X, PV.Z, literal.x,
; CM-NEXT: BFE_INT T8.Y, PV.Y, 0.0, literal.y,
; CM-NEXT: LSHR T0.Z, T7.X, literal.y,
; CM-NEXT: BFE_INT * T9.W, T0.Y, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T9.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; IR under test: one vector load, a sext to <8 x i32>, one vector store.
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i32>
store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
ret void
}
; Kernel under test: load a <16 x i16> vector from global memory
; (addrspace(1)) at %in, zero-extend every element to i32, and store the
; <16 x i32> result to %out.
;
; The assertion lines inside this function were autogenerated (see the note at
; the top of this file); regenerate them with the update script instead of
; editing them by hand.
;
; What the assertions pin down:
;  - The three GCN prefixes expect two dwordx4 loads (the second 16 bytes past
;    the first), separate s_waitcnt vmcnt(1) and vmcnt(0) waits, v_lshrrev_b32
;    by 16 and v_and_b32 with a 0xffff mask on each loaded dword, and four
;    dwordx4 stores covering byte offsets 0, 16, 32 and 48 from %out.
;  - The EG and CM prefixes expect two VTX_READ_128 fetches, LSHR by 16 plus
;    AND_INT with 65535, and four MEM_RAT_CACHELESS stores.
define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v4
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s2, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s2, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s2, v3
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s2, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v7
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s2, v5
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s2, v4
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s2, v7
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v6
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1
; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v7
; GCN-HSA-NEXT: v_and_b32_e32 v9, s4, v7
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v6
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[7:10]
; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v1
; GCN-HSA-NEXT: v_and_b32_e32 v8, s4, v1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v6, s4, v0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v10, s4, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_and_b32_e32 v2, s4, v5
; GCN-HSA-NEXT: v_and_b32_e32 v0, s4, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[10:13]
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[6:9]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v7
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s6, v7
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v6
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s6, v6
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s6, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s6, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s6, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s6, v5
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v4
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v16i16_to_v16i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @8
; EG-NEXT: ALU 35, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T18.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T11.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T16.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T12.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: LSHR * T13.W, T12.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T13.Z, T12.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR T13.Y, T12.X, literal.x,
; EG-NEXT: LSHR * T14.W, T12.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T13.X, T12.X, literal.x,
; EG-NEXT: AND_INT T14.Z, T12.W, literal.x,
; EG-NEXT: LSHR * T12.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
; EG-NEXT: LSHR T14.Y, T12.Z, literal.x,
; EG-NEXT: LSHR * T15.W, T11.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T14.X, T12.Z, literal.x,
; EG-NEXT: AND_INT T15.Z, T11.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
; EG-NEXT: LSHR T15.Y, T11.X, literal.y,
; EG-NEXT: LSHR T17.W, T11.W, literal.y,
; EG-NEXT: AND_INT * T15.X, T11.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T17.Z, T11.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
; EG-NEXT: LSHR T11.X, PV.W, literal.x,
; EG-NEXT: LSHR T17.Y, T11.Z, literal.y,
; EG-NEXT: AND_INT * T17.X, T11.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR * T18.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v16i16_to_v16i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 1 @8
; CM-NEXT: ALU 33, @13, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T18.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T17.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T16.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T14.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: LSHR * T13.W, T12.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T13.Z, T12.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR T13.Y, T12.Z, literal.x,
; CM-NEXT: LSHR * T12.W, T12.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T13.X, T12.Z, literal.x,
; CM-NEXT: AND_INT T12.Z, T12.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
; CM-NEXT: LSHR T14.X, PV.W, literal.x,
; CM-NEXT: LSHR T12.Y, T12.X, literal.y,
; CM-NEXT: LSHR * T15.W, T11.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T12.X, T12.X, literal.x,
; CM-NEXT: AND_INT T15.Z, T11.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
; CM-NEXT: LSHR T16.X, PV.W, literal.x,
; CM-NEXT: LSHR T15.Y, T11.Z, literal.y,
; CM-NEXT: LSHR * T11.W, T11.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T15.X, T11.Z, literal.x,
; CM-NEXT: AND_INT T11.Z, T11.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR T17.X, PV.W, literal.x,
; CM-NEXT: LSHR * T11.Y, T11.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT * T11.X, T11.X, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T18.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: one vector load, a zext to <16 x i32>, one vector store.
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i32>
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
ret void
}
; Codegen test: sign-extending global load, <16 x i16> -> <16 x i32>.
; The kernel loads sixteen i16 lanes from %in (addrspace 1), sign-extends each
; lane to i32 and stores the widened vector to %out (addrspace 1).
; The expectations below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; In the GCN expectations every loaded dword produces two results: v_bfe_i32
; (offset 0, width 16) for the low half and v_ashrrev_i32 by 16 for the high
; half. The r600 expectations use BFE_INT, with LSHR by 16 feeding the high half.
define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v1
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v2
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v2, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v6
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v7, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v6, 0, 16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v8, v0, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v14, v3, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v12, v2, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v1
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: s_waitcnt vmcnt(2)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v10, 16, v7
; GCN-HSA-NEXT: v_bfe_i32 v9, v7, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 16, v6
; GCN-HSA-NEXT: v_bfe_i32 v7, v6, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[7:10]
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v7
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v6
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v2
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v16i16_to_v16i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @8
; EG-NEXT: ALU 39, @13, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T12.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T11.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T14.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T13.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
; EG-NEXT: ALU clause starting at 12:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 13:
; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T14.X, PV.W, literal.x,
; EG-NEXT: BFE_INT * T15.Z, T11.Y, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: BFE_INT T15.X, T11.X, 0.0, literal.x,
; EG-NEXT: LSHR T0.Y, T12.W, literal.x,
; EG-NEXT: BFE_INT T16.Z, T11.W, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR T0.W, T12.Y, literal.x,
; EG-NEXT: LSHR * T1.W, T11.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T16.X, T11.Z, 0.0, literal.x,
; EG-NEXT: LSHR T1.Y, T11.W, literal.x,
; EG-NEXT: BFE_INT T17.Z, T12.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T15.W, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T1.W, T11.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T17.X, T12.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T15.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T18.Z, T12.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T16.W, PV.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T1.W, T11.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T18.X, T12.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T16.Y, PS, 0.0, literal.x,
; EG-NEXT: LSHR T0.Z, T12.X, literal.x,
; EG-NEXT: BFE_INT T17.W, T0.W, 0.0, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 32(4.484155e-44)
; EG-NEXT: LSHR T11.X, PS, literal.x,
; EG-NEXT: BFE_INT T17.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: LSHR T0.Z, T12.Z, literal.y,
; EG-NEXT: BFE_INT T18.W, T0.Y, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T12.X, PS, literal.x,
; EG-NEXT: BFE_INT * T18.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v16i16_to_v16i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @12, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 1 @8
; CM-NEXT: ALU 40, @13, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T11.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T18.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T14.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T13.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
; CM-NEXT: ALU clause starting at 12:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 13:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T13.X, PV.W, literal.x,
; CM-NEXT: LSHR T0.Y, T11.Y, literal.y,
; CM-NEXT: LSHR T0.Z, T11.Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T14.X, PV.W, literal.x,
; CM-NEXT: LSHR T1.Y, T11.W, literal.y,
; CM-NEXT: BFE_INT T15.Z, T12.W, 0.0, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: LSHR * T0.W, T12.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: BFE_INT T15.X, T12.Z, 0.0, literal.x,
; CM-NEXT: LSHR T2.Y, T12.Y, literal.x,
; CM-NEXT: BFE_INT T16.Z, T12.Y, 0.0, literal.x,
; CM-NEXT: LSHR * T1.W, T12.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T16.X, T12.X, 0.0, literal.x,
; CM-NEXT: LSHR T3.Y, T12.Z, literal.x,
; CM-NEXT: BFE_INT T12.Z, T11.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T15.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T12.X, T11.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T15.Y, PV.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT T17.Z, T11.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T16.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T17.X, T11.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T16.Y, T0.W, 0.0, literal.x,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T12.W, T1.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T18.X, PV.Z, literal.x,
; CM-NEXT: BFE_INT T12.Y, T0.Z, 0.0, literal.y,
; CM-NEXT: LSHR T0.Z, T11.X, literal.y,
; CM-NEXT: BFE_INT * T17.W, T0.Y, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT * T17.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; Single 32-byte vector load of all sixteen halfwords from global memory.
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
; Widen every lane to 32 bits, replicating the sign bit.
%ext = sext <16 x i16> %load to <16 x i32>
; Single 64-byte vector store of the widened result.
store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
ret void
}
; Codegen test: zero-extending global load, <32 x i16> -> <32 x i32>.
; The kernel loads thirty-two i16 lanes from %in (addrspace 1), zero-extends
; each lane to i32 and stores the widened vector to %out (addrspace 1).
; The expectations below are autogenerated (see the NOTE at the top of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
; In the GCN expectations every loaded dword produces two results: v_and_b32
; with a 0xffff mask held in an SGPR for the low half and v_lshrrev_b32 by 16
; for the high half. The r600 expectations use AND_INT with the literal 65535
; and LSHR by 16.
define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v1
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v7
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v3
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v2
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v0
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v7
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, s0, v6
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v5
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v4
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v31, 16, v9
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v8
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v11
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v10
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v9
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, s0, v8
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v13
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v12
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v15
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v14
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v13
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, s0, v12
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_mov_b32 s14, 0xffff
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x60
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: s_add_u32 s8, s0, 64
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x50
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
; GCN-HSA-NEXT: s_add_u32 s12, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v1
; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v5
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v4
; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v5
; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7
; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v7
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6
; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13
; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12
; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v12
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15
; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v15
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14
; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v14
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18]
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v9
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v8
; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1
; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v9
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; GCN-HSA-NEXT: v_and_b32_e32 v5, s14, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v3, s14, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v11
; GCN-HSA-NEXT: v_and_b32_e32 v13, s14, v11
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10
; GCN-HSA-NEXT: v_and_b32_e32 v11, s14, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18]
; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14]
; GCN-HSA-NEXT: flat_store_dwordx4 v[7:8], v[3:6]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s0, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(2)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v7
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s0, v7
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v6
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v6
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v5
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s0, v5
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v4
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v4
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v11
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s0, v11
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v10
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v10
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v9
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s0, v9
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v8
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v15
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v14
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v13
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v13
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v12
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v12
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v32i16_to_v32i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @12
; EG-NEXT: ALU 72, @21, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T34.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T33.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T32.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T30.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T29.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T27.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T26.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T24.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 48, #1
; EG-NEXT: VTX_READ_128 T22.XYZW, T19.X, 32, #1
; EG-NEXT: VTX_READ_128 T19.XYZW, T19.X, 16, #1
; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 21:
; EG-NEXT: LSHR * T23.W, T20.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T23.Z, T20.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR T23.Y, T20.Z, literal.x,
; EG-NEXT: LSHR * T20.W, T20.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T23.X, T20.Z, literal.x,
; EG-NEXT: AND_INT T20.Z, T20.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
; EG-NEXT: LSHR T20.Y, T20.X, literal.y,
; EG-NEXT: LSHR T25.W, T19.W, literal.y,
; EG-NEXT: AND_INT * T20.X, T20.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT * T25.Z, T19.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR T26.X, KC0[2].Y, literal.x,
; EG-NEXT: LSHR T25.Y, T19.Z, literal.y,
; EG-NEXT: LSHR T19.W, T19.Y, literal.y,
; EG-NEXT: AND_INT * T25.X, T19.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T19.Z, T19.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
; EG-NEXT: LSHR T27.X, PV.W, literal.x,
; EG-NEXT: LSHR T19.Y, T19.X, literal.y,
; EG-NEXT: LSHR T28.W, T22.W, literal.y,
; EG-NEXT: AND_INT * T19.X, T19.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T28.Z, T22.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
; EG-NEXT: LSHR T28.Y, T22.Z, literal.y,
; EG-NEXT: LSHR T22.W, T22.Y, literal.y,
; EG-NEXT: AND_INT * T28.X, T22.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T22.Z, T22.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
; EG-NEXT: LSHR T22.Y, T22.X, literal.y,
; EG-NEXT: LSHR T31.W, T21.W, literal.y,
; EG-NEXT: AND_INT * T22.X, T22.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T31.Z, T21.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
; EG-NEXT: LSHR T32.X, PV.W, literal.x,
; EG-NEXT: LSHR T31.Y, T21.Z, literal.y,
; EG-NEXT: LSHR T21.W, T21.Y, literal.y,
; EG-NEXT: AND_INT * T31.X, T21.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T21.Z, T21.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
; EG-NEXT: LSHR T33.X, PV.W, literal.x,
; EG-NEXT: LSHR T21.Y, T21.X, literal.y,
; EG-NEXT: AND_INT * T21.X, T21.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; EG-NEXT: LSHR * T34.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v32i16_to_v32i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @20, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 3 @12
; CM-NEXT: ALU 65, @21, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T21.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T32.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T22.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T29.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T19.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T26.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T20.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 48, #1
; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 0, #1
; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 16, #1
; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 32, #1
; CM-NEXT: ALU clause starting at 20:
; CM-NEXT: MOV * T19.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 21:
; CM-NEXT: LSHR * T23.W, T20.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T23.Z, T20.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR T23.Y, T20.X, literal.x,
; CM-NEXT: LSHR * T24.W, T20.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T23.X, T20.X, literal.x,
; CM-NEXT: AND_INT T24.Z, T20.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
; CM-NEXT: LSHR T20.X, PV.W, literal.x,
; CM-NEXT: LSHR T24.Y, T20.Z, literal.y,
; CM-NEXT: LSHR * T25.W, T19.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T24.X, T20.Z, literal.x,
; CM-NEXT: AND_INT T25.Z, T19.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
; CM-NEXT: LSHR T26.X, PV.W, literal.x,
; CM-NEXT: LSHR T25.Y, T19.X, literal.y,
; CM-NEXT: LSHR * T27.W, T19.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T25.X, T19.X, literal.x,
; CM-NEXT: AND_INT T27.Z, T19.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
; CM-NEXT: LSHR T19.X, PV.W, literal.x,
; CM-NEXT: LSHR T27.Y, T19.Z, literal.y,
; CM-NEXT: LSHR * T28.W, T22.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T27.X, T19.Z, literal.x,
; CM-NEXT: AND_INT T28.Z, T22.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
; CM-NEXT: LSHR T29.X, PV.W, literal.x,
; CM-NEXT: LSHR T28.Y, T22.X, literal.y,
; CM-NEXT: LSHR * T30.W, T22.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T28.X, T22.X, literal.x,
; CM-NEXT: AND_INT T30.Z, T22.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
; CM-NEXT: LSHR T30.Y, T22.Z, literal.y,
; CM-NEXT: LSHR * T31.W, T21.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T30.X, T22.Z, literal.x,
; CM-NEXT: AND_INT T31.Z, T21.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
; CM-NEXT: LSHR T32.X, PV.W, literal.x,
; CM-NEXT: LSHR T31.Y, T21.X, literal.y,
; CM-NEXT: LSHR * T33.W, T21.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T31.X, T21.X, literal.x,
; CM-NEXT: AND_INT * T33.Z, T21.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR T21.X, KC0[2].Y, literal.x,
; CM-NEXT: LSHR * T33.Y, T21.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T33.X, T21.Z, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR * T34.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; Single 64-byte vector load of all thirty-two halfwords from global memory.
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
; Widen every lane to 32 bits, filling the upper half with zeros.
%ext = zext <32 x i16> %load to <32 x i32>
; Single 128-byte vector store of the widened result.
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v3
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v2
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v1
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v0, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(2)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v6
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v4, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v11
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v10
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v11, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v10, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v9, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v8, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v15
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v14
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v15, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v14, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v35, 16, v13
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v33, 16, v12
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v34, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v32, v12, 0, 16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v16, v0, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19]
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7
; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6
; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v9
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v8
; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v8, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10
; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v13
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12
; GCN-HSA-NEXT: v_bfe_i32 v6, v13, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v12, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v15
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v14
; GCN-HSA-NEXT: v_bfe_i32 v2, v15, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v14, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v2
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v12
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v13, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v12, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v7
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v6
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v5
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v11
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v11, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v10
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v10, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v9
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v8
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v8, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v15
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v14
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v15, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v14, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v32i16_to_v32i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 9, @20, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @12
; EG-NEXT: ALU 73, @30, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T22.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T34.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T28.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T27.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T26.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T21.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T20.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T19.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T23.XYZW, T22.X, 16, #1
; EG-NEXT: VTX_READ_128 T24.XYZW, T22.X, 32, #1
; EG-NEXT: VTX_READ_128 T25.XYZW, T22.X, 0, #1
; EG-NEXT: VTX_READ_128 T22.XYZW, T22.X, 48, #1
; EG-NEXT: ALU clause starting at 20:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T19.X, PV.W, literal.x,
; EG-NEXT: LSHR * T20.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T21.X, PV.W, literal.x,
; EG-NEXT: MOV * T22.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 30:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T26.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T27.X, PV.W, literal.x,
; EG-NEXT: LSHR T0.W, T22.Y, literal.y,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T28.X, PS, literal.x,
; EG-NEXT: LSHR T0.Y, T22.W, literal.y,
; EG-NEXT: BFE_INT T29.Z, T25.W, 0.0, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: LSHR T1.W, T24.Y, literal.y,
; EG-NEXT: LSHR * T2.W, T24.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: BFE_INT T29.X, T25.Z, 0.0, literal.x,
; EG-NEXT: LSHR T1.Y, T23.Y, literal.x,
; EG-NEXT: BFE_INT T30.Z, T25.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR T3.W, T23.W, literal.x,
; EG-NEXT: LSHR * T4.W, T25.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T30.X, T25.X, 0.0, literal.x,
; EG-NEXT: LSHR T2.Y, T25.Y, literal.x,
; EG-NEXT: BFE_INT T31.Z, T23.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T29.W, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T4.W, T25.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T31.X, T23.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T29.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T25.Z, T23.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T30.W, PV.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T4.W, T25.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T25.X, T23.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T30.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T32.Z, T24.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T31.W, T3.W, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T3.W, T23.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T32.X, T24.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T31.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T23.Z, T24.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T25.W, T1.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T3.W, T23.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T23.X, T24.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T25.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T33.Z, T22.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T32.W, T2.W, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T2.W, T24.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T33.X, T22.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T32.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T24.Z, T22.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T23.W, T1.W, 0.0, literal.x,
; EG-NEXT: LSHR * T1.W, T24.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T24.X, T22.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T23.Y, PS, 0.0, literal.x,
; EG-NEXT: LSHR T0.Z, T22.Z, literal.x,
; EG-NEXT: BFE_INT T33.W, T0.Y, 0.0, literal.x,
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 112(1.569454e-43)
; EG-NEXT: LSHR T34.X, PS, literal.x,
; EG-NEXT: BFE_INT T33.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: LSHR T0.Z, T22.X, literal.y,
; EG-NEXT: BFE_INT T24.W, T0.W, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T22.X, PS, literal.x,
; EG-NEXT: BFE_INT * T24.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v32i16_to_v32i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @22, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @14
; CM-NEXT: ALU 7, @23, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 2 @16
; CM-NEXT: ALU 76, @31, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T34.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T20.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T28.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T27.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T26.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T25.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T24.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T21.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 14:
; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
; CM-NEXT: Fetch clause starting at 16:
; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 48, #1
; CM-NEXT: VTX_READ_128 T23.XYZW, T19.X, 32, #1
; CM-NEXT: VTX_READ_128 T19.XYZW, T19.X, 16, #1
; CM-NEXT: ALU clause starting at 22:
; CM-NEXT: MOV * T19.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 23:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T21.X, PV.W, literal.x,
; CM-NEXT: LSHR T0.Y, T20.Z, literal.y,
; CM-NEXT: LSHR T0.Z, T20.W, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 31:
; CM-NEXT: LSHR T24.X, T0.W, literal.x,
; CM-NEXT: LSHR T1.Y, T20.Y, literal.y,
; CM-NEXT: LSHR T1.Z, T19.Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 64(8.968310e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T25.X, PV.W, literal.x,
; CM-NEXT: LSHR T2.Y, T19.W, literal.y,
; CM-NEXT: LSHR T2.Z, T19.X, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 80(1.121039e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T26.X, PV.W, literal.x,
; CM-NEXT: LSHR T3.Y, T19.Y, literal.y,
; CM-NEXT: LSHR T3.Z, T23.Z, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T27.X, PV.W, literal.x,
; CM-NEXT: LSHR T4.Y, T23.W, literal.y,
; CM-NEXT: LSHR T4.Z, T23.X, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T28.X, PV.W, literal.x,
; CM-NEXT: LSHR T5.Y, T23.Y, literal.y,
; CM-NEXT: BFE_INT T29.Z, T22.Y, 0.0, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: LSHR * T0.W, T22.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: BFE_INT T29.X, T22.X, 0.0, literal.x,
; CM-NEXT: LSHR T6.Y, T22.W, literal.x,
; CM-NEXT: BFE_INT T30.Z, T22.W, 0.0, literal.x,
; CM-NEXT: LSHR * T1.W, T22.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T30.X, T22.Z, 0.0, literal.x,
; CM-NEXT: LSHR T7.Y, T22.X, literal.x,
; CM-NEXT: BFE_INT T22.Z, T23.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T29.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T22.X, T23.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T29.Y, PV.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT T31.Z, T23.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T30.W, T6.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T31.X, T23.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T30.Y, T0.W, 0.0, literal.x,
; CM-NEXT: BFE_INT T23.Z, T19.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T22.W, T5.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T23.X, T19.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T22.Y, T4.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T32.Z, T19.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T31.W, T4.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T32.X, T19.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T31.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: BFE_INT T19.Z, T20.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T23.W, T3.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T19.X, T20.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T23.Y, T2.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T33.Z, T20.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T32.W, T2.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T33.X, T20.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T32.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: LSHR T1.Z, T20.X, literal.x,
; CM-NEXT: BFE_INT * T19.W, T1.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T20.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT T19.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
; CM-NEXT: BFE_INT * T33.W, T0.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T34.X, PV.Z, literal.x,
; CM-NEXT: BFE_INT * T33.Y, T0.Y, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i32>
store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v64i16_to_v64i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3
; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(7)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v39, 16, v1
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v37, 16, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(6)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v43, 16, v7
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v41, 16, v6
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v3
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, s0, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dword v32, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v33, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v34, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v35, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, s0, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v36, s0, v0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v35, 16, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v4
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, s0, v7
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v40, s0, v6
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v5
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v32, s0, v4
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v11
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v10
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v47, 16, v9
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v45, 16, v8
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v6, s0, v11
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v10
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, s0, v9
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v44, s0, v8
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v11, 16, v15
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v14
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v51, 16, v13
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v49, 16, v12
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v10, s0, v15
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v14
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, s0, v13
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v48, s0, v12
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v19
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v18
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v55, 16, v17
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v53, 16, v16
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v14, s0, v19
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v18
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, s0, v17
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v52, s0, v16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v19, 16, v23
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v22
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v59, 16, v21
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v57, 16, v20
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v23
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v22
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v21
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v56, s0, v20
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v27
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v26
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v63, 16, v25
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v61, 16, v24
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v27
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, s0, v26
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v62, s0, v25
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v60, s0, v24
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v27, 16, v31
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v30
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v3, 16, v29
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v28
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v31
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, s0, v30
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v2, s0, v29
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v28
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_movk_i32 s14, 0x50
; GCN-HSA-NEXT: s_movk_i32 s15, 0x60
; GCN-HSA-NEXT: s_movk_i32 s16, 0x70
; GCN-HSA-NEXT: s_mov_b32 s17, 0xffff
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, s14
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, s15
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, s16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2
; GCN-HSA-NEXT: s_add_u32 s2, s2, 64
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33]
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xa0
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v26, s17, v1
; GCN-HSA-NEXT: v_and_b32_e32 v24, s17, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
; GCN-HSA-NEXT: s_add_u32 s12, s0, 0xb0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v3
; GCN-HSA-NEXT: v_and_b32_e32 v26, s17, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v24, s17, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v5
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v5
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v7
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v7
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(9)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v9
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v9
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v8
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v8
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v11
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v33
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v32
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v33
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v32
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v34
; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v35
; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v34
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, s15
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v29
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v28
; GCN-HSA-NEXT: v_and_b32_e32 v10, s17, v29
; GCN-HSA-NEXT: v_and_b32_e32 v8, s17, v28
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v31
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v31
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v30
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, s16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v21
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v20
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v23
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22
; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v23
; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v22
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15
; GCN-HSA-NEXT: v_and_b32_e32 v2, s17, v15
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14
; GCN-HSA-NEXT: v_and_b32_e32 v0, s17, v14
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13
; GCN-HSA-NEXT: v_and_b32_e32 v6, s17, v13
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12
; GCN-HSA-NEXT: v_and_b32_e32 v4, s17, v12
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16
; GCN-HSA-NEXT: v_and_b32_e32 v14, s17, v17
; GCN-HSA-NEXT: v_and_b32_e32 v12, s17, v16
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, s14
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v19
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v18
; GCN-HSA-NEXT: v_and_b32_e32 v10, s17, v19
; GCN-HSA-NEXT: v_and_b32_e32 v8, s17, v18
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: s_nop 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v64i16_to_v64i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000
; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v15
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v15
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v14
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v14
; GCN-NOHSA-VI-NEXT: buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v29, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[56:59], off, s[8:11], 0 offset:112
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v31, 16, v13
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v30, s0, v13
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v12
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v12
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v35, 16, v19
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v34, s0, v19
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v33, 16, v18
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v18
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v19, 16, v17
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v18, s0, v17
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v16
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v39, 16, v23
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v38, s0, v23
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v37, 16, v22
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v22
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v23, 16, v21
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v22, s0, v21
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v21, 16, v20
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v20
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v43, 16, v27
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v27
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v26
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v40, s0, v26
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v27, 16, v25
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v26, s0, v25
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v25, 16, v24
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v24
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v11
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v46, s0, v11
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v45, 16, v10
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v44, s0, v10
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v9
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v14, s0, v9
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v13, 16, v8
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v8
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v51, 16, v7
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v50, s0, v7
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v49, 16, v6
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v6
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v5
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v10, s0, v5
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v4
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v4
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v55, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v52, s0, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v6, s0, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v61, 16, v59
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v60, s0, v59
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v59, 16, v58
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v58, s0, v58
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v57
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v2, s0, v57
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v56
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v56
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:240
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:192
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:208
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:176
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:128
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v64i16_to_v64i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @22
; EG-NEXT: ALU 56, @39, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 3 @30
; EG-NEXT: ALU 87, @96, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T66.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T65.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T64.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T62.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T51.XYZW, T61.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T59.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T52.XYZW, T58.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T54.XYZW, T56.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T55.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T53.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T38.XYZW, T48.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T47.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T46.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T44.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T36.XYZW, T43.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T41.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T36.XYZW, T35.X, 0, #1
; EG-NEXT: VTX_READ_128 T37.XYZW, T35.X, 48, #1
; EG-NEXT: VTX_READ_128 T38.XYZW, T35.X, 32, #1
; EG-NEXT: VTX_READ_128 T39.XYZW, T35.X, 16, #1
; EG-NEXT: Fetch clause starting at 30:
; EG-NEXT: VTX_READ_128 T49.XYZW, T35.X, 112, #1
; EG-NEXT: VTX_READ_128 T50.XYZW, T35.X, 96, #1
; EG-NEXT: VTX_READ_128 T51.XYZW, T35.X, 80, #1
; EG-NEXT: VTX_READ_128 T52.XYZW, T35.X, 64, #1
; EG-NEXT: ALU clause starting at 38:
; EG-NEXT: MOV * T35.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 39:
; EG-NEXT: LSHR * T40.W, T36.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT * T40.Z, T36.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR T40.Y, T36.Z, literal.x,
; EG-NEXT: LSHR * T36.W, T36.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T40.X, T36.Z, literal.x,
; EG-NEXT: AND_INT T36.Z, T36.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: LSHR T41.X, PV.W, literal.x,
; EG-NEXT: LSHR T36.Y, T36.X, literal.y,
; EG-NEXT: LSHR T42.W, T39.W, literal.y,
; EG-NEXT: AND_INT * T36.X, T36.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT * T42.Z, T39.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: LSHR T43.X, KC0[2].Y, literal.x,
; EG-NEXT: LSHR T42.Y, T39.Z, literal.y,
; EG-NEXT: LSHR T39.W, T39.Y, literal.y,
; EG-NEXT: AND_INT * T42.X, T39.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T39.Z, T39.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
; EG-NEXT: LSHR T44.X, PV.W, literal.x,
; EG-NEXT: LSHR T39.Y, T39.X, literal.y,
; EG-NEXT: LSHR T45.W, T38.W, literal.y,
; EG-NEXT: AND_INT * T39.X, T39.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T45.Z, T38.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
; EG-NEXT: LSHR T46.X, PV.W, literal.x,
; EG-NEXT: LSHR T45.Y, T38.Z, literal.y,
; EG-NEXT: LSHR T38.W, T38.Y, literal.y,
; EG-NEXT: AND_INT * T45.X, T38.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T38.Z, T38.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
; EG-NEXT: LSHR T47.X, PV.W, literal.x,
; EG-NEXT: LSHR T38.Y, T38.X, literal.y,
; EG-NEXT: AND_INT * T38.X, T38.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.x,
; EG-NEXT: LSHR * T35.W, T37.W, literal.y,
; EG-NEXT: 64(8.968310e-44), 16(2.242078e-44)
; EG-NEXT: LSHR T48.X, PV.W, literal.x,
; EG-NEXT: AND_INT * T35.Z, T37.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 65535(9.183409e-41)
; EG-NEXT: ALU clause starting at 96:
; EG-NEXT: LSHR T35.Y, T37.Z, literal.x,
; EG-NEXT: LSHR * T37.W, T37.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T35.X, T37.Z, literal.x,
; EG-NEXT: AND_INT T37.Z, T37.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
; EG-NEXT: LSHR T53.X, PV.W, literal.x,
; EG-NEXT: LSHR T37.Y, T37.X, literal.y,
; EG-NEXT: LSHR T54.W, T52.W, literal.y,
; EG-NEXT: AND_INT * T37.X, T37.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T54.Z, T52.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
; EG-NEXT: LSHR T55.X, PV.W, literal.x,
; EG-NEXT: LSHR T54.Y, T52.Z, literal.y,
; EG-NEXT: LSHR T52.W, T52.Y, literal.y,
; EG-NEXT: AND_INT * T54.X, T52.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T52.Z, T52.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
; EG-NEXT: LSHR T56.X, PV.W, literal.x,
; EG-NEXT: LSHR T52.Y, T52.X, literal.y,
; EG-NEXT: LSHR T57.W, T51.W, literal.y,
; EG-NEXT: AND_INT * T52.X, T52.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T57.Z, T51.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
; EG-NEXT: LSHR T58.X, PV.W, literal.x,
; EG-NEXT: LSHR T57.Y, T51.Z, literal.y,
; EG-NEXT: LSHR T51.W, T51.Y, literal.y,
; EG-NEXT: AND_INT * T57.X, T51.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T51.Z, T51.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
; EG-NEXT: LSHR T59.X, PV.W, literal.x,
; EG-NEXT: LSHR T51.Y, T51.X, literal.y,
; EG-NEXT: LSHR T60.W, T50.W, literal.y,
; EG-NEXT: AND_INT * T51.X, T51.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T60.Z, T50.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
; EG-NEXT: LSHR T61.X, PV.W, literal.x,
; EG-NEXT: LSHR T60.Y, T50.Z, literal.y,
; EG-NEXT: LSHR T50.W, T50.Y, literal.y,
; EG-NEXT: AND_INT * T60.X, T50.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T50.Z, T50.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
; EG-NEXT: LSHR T62.X, PV.W, literal.x,
; EG-NEXT: LSHR T50.Y, T50.X, literal.y,
; EG-NEXT: LSHR T63.W, T49.W, literal.y,
; EG-NEXT: AND_INT * T50.X, T50.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T63.Z, T49.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
; EG-NEXT: LSHR T64.X, PV.W, literal.x,
; EG-NEXT: LSHR T63.Y, T49.Z, literal.y,
; EG-NEXT: LSHR T49.W, T49.Y, literal.y,
; EG-NEXT: AND_INT * T63.X, T49.Z, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: AND_INT T49.Z, T49.Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
; EG-NEXT: LSHR T65.X, PV.W, literal.x,
; EG-NEXT: LSHR T49.Y, T49.X, literal.y,
; EG-NEXT: AND_INT * T49.X, T49.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
; EG-NEXT: LSHR * T66.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v64i16_to_v64i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @38, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 3 @22
; CM-NEXT: ALU 50, @39, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 3 @30
; CM-NEXT: ALU 78, @90, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T48.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T64.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T60, T49.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T59, T61.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T57, T50.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T56, T58.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T54, T51.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T53, T55.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T37.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T47, T52.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T38.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T46.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T39.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T43.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T36.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 112, #1
; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 64, #1
; CM-NEXT: VTX_READ_128 T38.XYZW, T35.X, 80, #1
; CM-NEXT: VTX_READ_128 T39.XYZW, T35.X, 96, #1
; CM-NEXT: Fetch clause starting at 30:
; CM-NEXT: VTX_READ_128 T48.XYZW, T35.X, 0, #1
; CM-NEXT: VTX_READ_128 T49.XYZW, T35.X, 16, #1
; CM-NEXT: VTX_READ_128 T50.XYZW, T35.X, 32, #1
; CM-NEXT: VTX_READ_128 T51.XYZW, T35.X, 48, #1
; CM-NEXT: ALU clause starting at 38:
; CM-NEXT: MOV * T35.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 39:
; CM-NEXT: LSHR * T40.W, T36.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT * T40.Z, T36.Y, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR T40.Y, T36.X, literal.x,
; CM-NEXT: LSHR * T41.W, T36.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T40.X, T36.X, literal.x,
; CM-NEXT: AND_INT T41.Z, T36.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 224(3.138909e-43)
; CM-NEXT: LSHR T36.X, PV.W, literal.x,
; CM-NEXT: LSHR T41.Y, T36.Z, literal.y,
; CM-NEXT: LSHR * T42.W, T39.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T41.X, T36.Z, literal.x,
; CM-NEXT: AND_INT T42.Z, T39.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 240(3.363116e-43)
; CM-NEXT: LSHR T43.X, PV.W, literal.x,
; CM-NEXT: LSHR T42.Y, T39.X, literal.y,
; CM-NEXT: LSHR * T44.W, T39.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T42.X, T39.X, literal.x,
; CM-NEXT: AND_INT T44.Z, T39.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 192(2.690493e-43)
; CM-NEXT: LSHR T39.X, PV.W, literal.x,
; CM-NEXT: LSHR T44.Y, T39.Z, literal.y,
; CM-NEXT: LSHR * T45.W, T38.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T44.X, T39.Z, literal.x,
; CM-NEXT: AND_INT T45.Z, T38.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 208(2.914701e-43)
; CM-NEXT: LSHR T46.X, PV.W, literal.x,
; CM-NEXT: LSHR T45.Y, T38.X, literal.y,
; CM-NEXT: LSHR * T47.W, T38.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T45.X, T38.X, literal.x,
; CM-NEXT: AND_INT T47.Z, T38.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 160(2.242078e-43)
; CM-NEXT: LSHR T38.X, PV.W, literal.x,
; CM-NEXT: LSHR T47.Y, T38.Z, literal.y,
; CM-NEXT: LSHR * T35.W, T37.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T47.X, T38.Z, literal.x,
; CM-NEXT: AND_INT T35.Z, T37.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 176(2.466285e-43)
; CM-NEXT: ALU clause starting at 90:
; CM-NEXT: LSHR T52.X, T0.W, literal.x,
; CM-NEXT: LSHR T35.Y, T37.X, literal.y,
; CM-NEXT: LSHR * T53.W, T37.W, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T35.X, T37.X, literal.x,
; CM-NEXT: AND_INT T53.Z, T37.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 128(1.793662e-43)
; CM-NEXT: LSHR T37.X, PV.W, literal.x,
; CM-NEXT: LSHR T53.Y, T37.Z, literal.y,
; CM-NEXT: LSHR * T54.W, T51.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T53.X, T37.Z, literal.x,
; CM-NEXT: AND_INT T54.Z, T51.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 144(2.017870e-43)
; CM-NEXT: LSHR T55.X, PV.W, literal.x,
; CM-NEXT: LSHR T54.Y, T51.X, literal.y,
; CM-NEXT: LSHR * T56.W, T51.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T54.X, T51.X, literal.x,
; CM-NEXT: AND_INT T56.Z, T51.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 96(1.345247e-43)
; CM-NEXT: LSHR T51.X, PV.W, literal.x,
; CM-NEXT: LSHR T56.Y, T51.Z, literal.y,
; CM-NEXT: LSHR * T57.W, T50.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T56.X, T51.Z, literal.x,
; CM-NEXT: AND_INT T57.Z, T50.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 112(1.569454e-43)
; CM-NEXT: LSHR T58.X, PV.W, literal.x,
; CM-NEXT: LSHR T57.Y, T50.X, literal.y,
; CM-NEXT: LSHR * T59.W, T50.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T57.X, T50.X, literal.x,
; CM-NEXT: AND_INT T59.Z, T50.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 64(8.968310e-44)
; CM-NEXT: LSHR T50.X, PV.W, literal.x,
; CM-NEXT: LSHR T59.Y, T50.Z, literal.y,
; CM-NEXT: LSHR * T60.W, T49.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T59.X, T50.Z, literal.x,
; CM-NEXT: AND_INT T60.Z, T49.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 80(1.121039e-43)
; CM-NEXT: LSHR T61.X, PV.W, literal.x,
; CM-NEXT: LSHR T60.Y, T49.X, literal.y,
; CM-NEXT: LSHR * T62.W, T49.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T60.X, T49.X, literal.x,
; CM-NEXT: AND_INT T62.Z, T49.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 32(4.484155e-44)
; CM-NEXT: LSHR T49.X, PV.W, literal.x,
; CM-NEXT: LSHR T62.Y, T49.Z, literal.y,
; CM-NEXT: LSHR * T63.W, T48.Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T62.X, T49.Z, literal.x,
; CM-NEXT: AND_INT T63.Z, T48.Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 48(6.726233e-44)
; CM-NEXT: LSHR T64.X, PV.W, literal.x,
; CM-NEXT: LSHR T63.Y, T48.X, literal.y,
; CM-NEXT: LSHR * T65.W, T48.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T63.X, T48.X, literal.x,
; CM-NEXT: AND_INT * T65.Z, T48.W, literal.x,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR T48.X, KC0[2].Y, literal.x,
; CM-NEXT: LSHR * T65.Y, T48.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: AND_INT T65.X, T48.Z, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: LSHR * T66.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = zext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v64i16_to_v64i32:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3
; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[32:35], off, s[8:11], 0 offset:112
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v19
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v18
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v19, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v18, 0, 16
; GCN-NOHSA-SI-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v39, 16, v17
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v37, 16, v16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v38, v17, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v36, v16, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v19, 16, v23
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 16, v22
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v18, v23, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v22, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v43, 16, v21
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v41, 16, v20
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v42, v21, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v40, v20, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v23, 16, v27
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v21, 16, v26
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v22, v27, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v20, v26, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v47, 16, v25
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v45, 16, v24
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v46, v25, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v44, v24, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v27, 16, v31
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v25, 16, v30
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v26, v31, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v24, v30, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v51, 16, v29
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v49, 16, v28
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v50, v29, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v48, v28, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v31, 16, v15
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v29, 16, v14
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v30, v15, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v28, v14, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v55, 16, v13
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v53, 16, v12
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v54, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v52, v12, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v15, 16, v11
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 16, v10
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v14, v11, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v10, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v59, 16, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v57, 16, v8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v58, v9, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v56, v8, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v11, 16, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 16, v6
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v10, v7, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v63, 16, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v61, 16, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v62, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v60, v4, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 16, v35
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 16, v34
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v35, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v34, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 16, v33
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 16, v32
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v33, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v32, 0, 16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[60:63], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_movk_i32 s8, 0x70
; GCN-HSA-NEXT: s_movk_i32 s9, 0x60
; GCN-HSA-NEXT: s_movk_i32 s10, 0x50
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, s8
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: s_add_u32 s4, s2, s9
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s4, s2, s10
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 64
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s6, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6
; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17]
; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29]
; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33]
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v26, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v24, v0, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v3
; GCN-HSA-NEXT: v_bfe_i32 v26, v3, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v24, v2, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[24:27]
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7
; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6
; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v13
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v12
; GCN-HSA-NEXT: v_bfe_i32 v2, v13, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v9
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v8
; GCN-HSA-NEXT: v_bfe_i32 v2, v9, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v8, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v15
; GCN-HSA-NEXT: v_bfe_i32 v6, v15, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v14
; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v14, 16, v11
; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v12, 16, v10
; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[11:14]
; GCN-HSA-NEXT: s_waitcnt vmcnt(11)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v17
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16
; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v16, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, s9
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18
; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, s8
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, s10
; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v23
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v22
; GCN-HSA-NEXT: v_bfe_i32 v10, v23, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v22, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-HSA-NEXT: s_waitcnt vmcnt(12)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v29
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v28
; GCN-HSA-NEXT: v_bfe_i32 v14, v29, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v12, v28, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30
; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16
; GCN-HSA-NEXT: s_add_u32 s0, s0, 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20
; GCN-HSA-NEXT: v_bfe_i32 v2, v21, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v20, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_waitcnt vmcnt(14)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v32
; GCN-HSA-NEXT: v_bfe_i32 v6, v33, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v32, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34
; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v64i16_to_v64i32:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s90, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s91, 0xe80000
; GCN-NOHSA-VI-NEXT: s_add_u32 s88, s88, s3
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:48
; GCN-NOHSA-VI-NEXT: s_addc_u32 s89, s89, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(6)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v59, 16, v1
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v57, 16, v0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(4)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v51, 16, v9
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v15
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v15, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v14
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v14, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dword v28, off, s[88:91], 0 offset:4 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dword v29, off, s[88:91], 0 offset:8 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v30, off, s[88:91], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_store_dword v31, off, s[88:91], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[60:63], off, s[8:11], 0 offset:112
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v50, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v58, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v56, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v31, 16, v19
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v30, v19, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v29, 16, v18
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v28, v18, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v39, 16, v17
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v38, v17, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v37, 16, v16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v36, v16, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 16, v23
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v23, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 16, v22
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v22, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v43, 16, v21
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v42, v21, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v41, 16, v20
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v40, v20, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v23, 16, v27
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v22, v27, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 16, v26
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v26, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v47, 16, v25
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v46, v25, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v45, 16, v24
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v44, v24, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v27, 16, v11
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v26, v11, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v25, 16, v10
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v24, v10, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v49, 16, v8
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v48, v8, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 16, v2
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v35, 16, v13
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v34, v13, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v33, 16, v12
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v32, v12, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 16, v7
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 16, v6
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v55, 16, v5
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v54, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v53, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v52, v4, 0, 16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 16, v61
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 16, v60
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v61, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v60, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 16, v63
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 16, v62
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v63, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v62, 0, 16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:240
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[56:59], off, s[0:3], 0 offset:192
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[52:55], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:176
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:128
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[40:43], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dword v0, off, s[88:91], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[88:91], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[88:91], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: buffer_load_dword v3, off, s[88:91], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v64i16_to_v64i32:
; EG: ; %bb.0:
; EG-NEXT: ALU 18, @38, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 7 @22
; EG-NEXT: ALU 75, @57, KC0[CB0:0-32], KC1[]
; EG-NEXT: ALU 71, @133, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T41.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T65.XYZW, T66.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T56.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T64.XYZW, T55.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T54.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T63.XYZW, T53.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T52.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T62.XYZW, T51.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T50.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T61.XYZW, T49.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T40.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T60.XYZW, T39.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T38.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T59.XYZW, T37.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T58.XYZW, T36.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T57.XYZW, T35.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T42.XYZW, T41.X, 16, #1
; EG-NEXT: VTX_READ_128 T43.XYZW, T41.X, 32, #1
; EG-NEXT: VTX_READ_128 T44.XYZW, T41.X, 0, #1
; EG-NEXT: VTX_READ_128 T45.XYZW, T41.X, 48, #1
; EG-NEXT: VTX_READ_128 T46.XYZW, T41.X, 64, #1
; EG-NEXT: VTX_READ_128 T47.XYZW, T41.X, 80, #1
; EG-NEXT: VTX_READ_128 T48.XYZW, T41.X, 96, #1
; EG-NEXT: VTX_READ_128 T41.XYZW, T41.X, 112, #1
; EG-NEXT: ALU clause starting at 38:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T35.X, PV.W, literal.x,
; EG-NEXT: LSHR * T36.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T37.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T38.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T40.X, PV.W, literal.x,
; EG-NEXT: MOV * T41.X, KC0[2].Z,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 57:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T49.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T50.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T51.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; EG-NEXT: LSHR T52.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T53.X, PV.W, literal.x,
; EG-NEXT: LSHR T0.Y, T41.Y, literal.y,
; EG-NEXT: LSHR T0.Z, T41.W, literal.y,
; EG-NEXT: LSHR T0.W, T48.Y, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 160(2.242078e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T54.X, PS, literal.x,
; EG-NEXT: LSHR T1.Y, T48.W, literal.y,
; EG-NEXT: LSHR T1.Z, T47.Y, literal.y,
; EG-NEXT: LSHR T1.W, T47.W, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T2.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 208(2.914701e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T55.X, PS, literal.x,
; EG-NEXT: LSHR T2.Y, T46.Y, literal.y,
; EG-NEXT: LSHR T2.Z, T46.W, literal.y,
; EG-NEXT: LSHR T2.W, T45.Y, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T3.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 192(2.690493e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T56.X, PS, literal.x,
; EG-NEXT: LSHR T3.Y, T45.W, literal.y,
; EG-NEXT: BFE_INT T57.Z, T44.W, 0.0, literal.y, BS:VEC_120/SCL_212
; EG-NEXT: LSHR T3.W, T43.Y, literal.y,
; EG-NEXT: LSHR * T4.W, T43.W, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: BFE_INT T57.X, T44.Z, 0.0, literal.x,
; EG-NEXT: LSHR T4.Y, T42.Y, literal.x,
; EG-NEXT: BFE_INT T58.Z, T44.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR T5.W, T42.W, literal.x,
; EG-NEXT: LSHR * T6.W, T44.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T58.X, T44.X, 0.0, literal.x,
; EG-NEXT: LSHR T5.Y, T44.Y, literal.x,
; EG-NEXT: BFE_INT T59.Z, T42.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T57.W, PS, 0.0, literal.x,
; EG-NEXT: LSHR * T6.W, T44.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T59.X, T42.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T57.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T44.Z, T42.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T58.W, PV.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T6.W, T44.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T44.X, T42.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T58.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T60.Z, T43.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T59.W, T5.W, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T5.W, T42.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T60.X, T43.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T59.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T42.Z, T43.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T44.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T5.W, T42.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T42.X, T43.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T44.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T61.Z, T45.W, 0.0, literal.x,
; EG-NEXT: BFE_INT * T60.W, T4.W, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 133:
; EG-NEXT: LSHR * T4.W, T43.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T61.X, T45.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T60.Y, PV.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T43.Z, T45.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T42.W, T3.W, 0.0, literal.x,
; EG-NEXT: LSHR * T3.W, T43.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T43.X, T45.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T42.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T62.Z, T46.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T61.W, T3.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T3.W, T45.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T62.X, T46.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T61.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T45.Z, T46.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T43.W, T2.W, 0.0, literal.x,
; EG-NEXT: LSHR * T2.W, T45.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T45.X, T46.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T43.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T63.Z, T47.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T62.W, T2.Z, 0.0, literal.x,
; EG-NEXT: LSHR * T2.W, T46.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T63.X, T47.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T62.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T46.Z, T47.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T45.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T2.W, T46.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T46.X, T47.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T45.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T64.Z, T48.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T63.W, T1.W, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T1.W, T47.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T64.X, T48.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T63.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T47.Z, T48.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T46.W, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: LSHR * T1.W, T47.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T47.X, T48.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T46.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T65.Z, T41.W, 0.0, literal.x,
; EG-NEXT: BFE_INT T64.W, T1.Y, 0.0, literal.x,
; EG-NEXT: LSHR * T1.W, T48.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T65.X, T41.Z, 0.0, literal.x,
; EG-NEXT: BFE_INT T64.Y, PS, 0.0, literal.x,
; EG-NEXT: BFE_INT T48.Z, T41.Y, 0.0, literal.x,
; EG-NEXT: BFE_INT T47.W, T0.W, 0.0, literal.x,
; EG-NEXT: LSHR * T0.W, T48.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T48.X, T41.X, 0.0, literal.x,
; EG-NEXT: BFE_INT T47.Y, PS, 0.0, literal.x,
; EG-NEXT: LSHR T1.Z, T41.Z, literal.x,
; EG-NEXT: BFE_INT T65.W, T0.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 240(3.363116e-43)
; EG-NEXT: LSHR T66.X, PS, literal.x,
; EG-NEXT: BFE_INT T65.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: LSHR T0.Z, T41.X, literal.y,
; EG-NEXT: BFE_INT T48.W, T0.Y, 0.0, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T41.X, PS, literal.x,
; EG-NEXT: BFE_INT * T48.Y, PV.Z, 0.0, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
;
; CM-LABEL: global_sextload_v64i16_to_v64i32:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @40, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 1 @24
; CM-NEXT: ALU 15, @41, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 5 @28
; CM-NEXT: ALU 82, @57, KC0[CB0:0-32], KC1[]
; CM-NEXT: ALU 72, @140, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T65, T66.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T36, T37.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T64, T56.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T55.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T63, T54.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T53.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T62, T52.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T51.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T61, T50.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T43, T49.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T60, T48.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T47.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T59, T46.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T40.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T58, T39.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T57, T38.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 24:
; CM-NEXT: VTX_READ_128 T36.XYZW, T35.X, 16, #1
; CM-NEXT: VTX_READ_128 T37.XYZW, T35.X, 0, #1
; CM-NEXT: Fetch clause starting at 28:
; CM-NEXT: VTX_READ_128 T41.XYZW, T35.X, 112, #1
; CM-NEXT: VTX_READ_128 T42.XYZW, T35.X, 96, #1
; CM-NEXT: VTX_READ_128 T43.XYZW, T35.X, 80, #1
; CM-NEXT: VTX_READ_128 T44.XYZW, T35.X, 64, #1
; CM-NEXT: VTX_READ_128 T45.XYZW, T35.X, 48, #1
; CM-NEXT: VTX_READ_128 T35.XYZW, T35.X, 32, #1
; CM-NEXT: ALU clause starting at 40:
; CM-NEXT: MOV * T35.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 41:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T38.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; CM-NEXT: LSHR T39.X, PV.W, literal.x,
; CM-NEXT: LSHR T0.Y, T37.Z, literal.y,
; CM-NEXT: LSHR T0.Z, T37.W, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 192(2.690493e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T40.X, PV.W, literal.x,
; CM-NEXT: LSHR T1.Y, T37.Y, literal.y,
; CM-NEXT: LSHR T1.Z, T36.Z, literal.y,
; CM-NEXT: LSHR * T0.W, T36.W, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: ALU clause starting at 57:
; CM-NEXT: LSHR T2.Z, T36.X, literal.x,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 208(2.914701e-43)
; CM-NEXT: LSHR T46.X, PV.W, literal.x,
; CM-NEXT: LSHR T2.Y, T36.Y, literal.y,
; CM-NEXT: LSHR T3.Z, T35.Z, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 160(2.242078e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T47.X, PV.W, literal.x,
; CM-NEXT: LSHR T3.Y, T35.W, literal.y,
; CM-NEXT: LSHR T4.Z, T35.X, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 176(2.466285e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T48.X, PV.W, literal.x,
; CM-NEXT: LSHR T4.Y, T35.Y, literal.y,
; CM-NEXT: LSHR T5.Z, T45.Z, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 128(1.793662e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T49.X, PV.W, literal.x,
; CM-NEXT: LSHR T5.Y, T45.W, literal.y,
; CM-NEXT: LSHR T6.Z, T45.X, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 144(2.017870e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T50.X, PV.W, literal.x,
; CM-NEXT: LSHR T6.Y, T45.Y, literal.y,
; CM-NEXT: LSHR T7.Z, T44.Z, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 96(1.345247e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T51.X, PV.W, literal.x,
; CM-NEXT: LSHR T7.Y, T44.W, literal.y,
; CM-NEXT: LSHR T8.Z, T44.X, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T52.X, PV.W, literal.x,
; CM-NEXT: LSHR T8.Y, T44.Y, literal.y,
; CM-NEXT: LSHR T9.Z, T43.Z, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 64(8.968310e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T53.X, PV.W, literal.x,
; CM-NEXT: LSHR T9.Y, T43.W, literal.y,
; CM-NEXT: LSHR T10.Z, T43.X, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 80(1.121039e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T54.X, PV.W, literal.x,
; CM-NEXT: LSHR T10.Y, T43.Y, literal.y,
; CM-NEXT: LSHR T11.Z, T42.Z, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T55.X, PV.W, literal.x,
; CM-NEXT: LSHR T11.Y, T42.W, literal.y,
; CM-NEXT: LSHR T12.Z, T42.X, literal.y,
; CM-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T56.X, PV.W, literal.x,
; CM-NEXT: LSHR T12.Y, T42.Y, literal.y,
; CM-NEXT: BFE_INT T57.Z, T41.Y, 0.0, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: LSHR * T1.W, T41.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: BFE_INT T57.X, T41.X, 0.0, literal.x,
; CM-NEXT: LSHR T13.Y, T41.W, literal.x,
; CM-NEXT: BFE_INT T58.Z, T41.W, 0.0, literal.x,
; CM-NEXT: LSHR * T2.W, T41.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T58.X, T41.Z, 0.0, literal.x,
; CM-NEXT: LSHR T14.Y, T41.X, literal.x,
; CM-NEXT: BFE_INT T41.Z, T42.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T57.W, PV.W, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T41.X, T42.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T57.Y, PV.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT T59.Z, T42.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T58.W, T13.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: ALU clause starting at 140:
; CM-NEXT: BFE_INT T59.X, T42.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T58.Y, T1.W, 0.0, literal.x,
; CM-NEXT: BFE_INT T42.Z, T43.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T41.W, T12.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T42.X, T43.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T41.Y, T12.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T60.Z, T43.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T59.W, T11.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T60.X, T43.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T59.Y, T11.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: BFE_INT T43.Z, T44.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T42.W, T10.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T43.X, T44.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T42.Y, T10.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T61.Z, T44.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T60.W, T9.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T61.X, T44.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T60.Y, T9.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: BFE_INT T44.Z, T45.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T43.W, T8.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T44.X, T45.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T43.Y, T8.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T62.Z, T45.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T61.W, T7.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T62.X, T45.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T61.Y, T7.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: BFE_INT T45.Z, T35.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T44.W, T6.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T45.X, T35.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T44.Y, T6.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T63.Z, T35.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T62.W, T5.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T63.X, T35.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T62.Y, T5.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: BFE_INT T35.Z, T36.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T45.W, T4.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T35.X, T36.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T45.Y, T4.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T64.Z, T36.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T63.W, T3.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T64.X, T36.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T63.Y, T3.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: BFE_INT T36.Z, T37.Y, 0.0, literal.x,
; CM-NEXT: BFE_INT * T35.W, T2.Y, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T36.X, T37.X, 0.0, literal.x,
; CM-NEXT: BFE_INT T35.Y, T2.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T65.Z, T37.W, 0.0, literal.x,
; CM-NEXT: BFE_INT * T64.W, T0.W, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T65.X, T37.Z, 0.0, literal.x,
; CM-NEXT: BFE_INT T64.Y, T1.Z, 0.0, literal.x, BS:VEC_120/SCL_212
; CM-NEXT: LSHR T1.Z, T37.X, literal.x,
; CM-NEXT: BFE_INT * T36.W, T1.Y, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T37.X, KC0[2].Y, literal.x,
; CM-NEXT: BFE_INT T36.Y, PV.Z, 0.0, literal.y,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.y,
; CM-NEXT: BFE_INT * T65.W, T0.Z, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR T66.X, PV.Z, literal.x,
; CM-NEXT: BFE_INT * T65.Y, T0.Y, 0.0, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
%load = load <64 x i16>, <64 x i16> addrspace(1)* %in
%ext = sext <64 x i16> %load to <64 x i32>
store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
ret void
}
; Scalar zero-extending load test: reads a single i16 from %in in global
; memory (addrspace(1)), widens it to i64 with zext, and stores it to %out.
; The autogenerated checks below pin the expected code for every RUN line.
; On the GCN targets they expect an unsigned 16-bit load with the high dword
; of the result supplied by a move of constant 0 (the VI checks additionally
; mask the loaded value with 0xffff).
define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_i16_to_i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_i16_to_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_i16_to_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T0.Y, 0.0,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%a = load i16, i16 addrspace(1)* %in
%ext = zext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
ret void
}
; FIXME: Need to optimize this sequence to avoid extra bfe:
; t28: i32,ch = load<LD2[%in(addrspace=1)], anyext from i16> t12, t27, undef:i64
; t31: i64 = any_extend t28
; t33: i64 = sign_extend_inreg t31, ValueType:ch:i16
; TODO: These could be expanded earlier using ASHR 15
; Scalar sign-extending load test: reads a single i16 from %in in global
; memory (addrspace(1)), widens it to i64 with sext, and stores it to %out.
; In the expected GCN code the high dword of the result is an arithmetic
; shift right by 31 of the low dword. The SI and HSA checks use a signed
; short load, while the VI checks use an unsigned short load followed by
; v_bfe_i32 to sign-extend the low 16 bits.
define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_i16_to_i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_i16_to_i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_i16_to_i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_i16_to_i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T0.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%a = load i16, i16 addrspace(1)* %in
%ext = sext i16 %a to i64
store i64 %ext, i64 addrspace(1)* %out
ret void
}
; Single-element vector zero-extending load test: reads a <1 x i16> from %in
; in global memory (addrspace(1)), widens it to <1 x i64> with zext, and
; stores it to %out.
; The expected GCN code is an unsigned 16-bit load whose high result dword is
; a move of constant 0 (the VI checks additionally mask the loaded value with
; 0xffff).
define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_ushort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v3, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v1i16_to_v1i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV * T0.Y, 0.0,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v1i16_to_v1i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T0.Y, 0.0,
; CM-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = zext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
ret void
}
; TODO: These could be expanded earlier using ASHR 15
; Single-element vector sign-extending load test: reads a <1 x i16> from %in
; in global memory (addrspace(1)), widens it to <1 x i64> with sext, and
; stores it to %out.
; In the expected GCN code the high dword is an arithmetic shift right by 31
; of the low dword. The SI and HSA checks use a signed short load, while the
; VI checks use an unsigned short load followed by v_bfe_i32.
define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_sshort v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: flat_load_sshort v2, v[2:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v1i16_to_v1i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: BFE_INT T0.X, T0.X, 0.0, literal.x,
; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T0.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v1i16_to_v1i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T0, T1.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_16 T0.X, T0.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T0.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: BFE_INT * T0.X, T0.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T0.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%load = load <1 x i16>, <1 x i16> addrspace(1)* %in
%ext = sext <1 x i16> %load to <1 x i64>
store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
ret void
}
; Two-element vector zero-extending load test: reads a <2 x i16> from %in in
; global memory (addrspace(1)), widens it to <2 x i64> with zext, and stores
; it to %out.
; The expected GCN code fetches both elements with one dword load, isolates
; the low element with an AND of 0xffff and the high element with a logical
; shift right by 16, fills the two high dwords with zero, and writes the
; result with a single dwordx4 store.
define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v2, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, 0xffff, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v2i16_to_v2i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 6, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: LSHR * T4.Z, T4.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T4.X, T4.X, literal.x,
; EG-NEXT: MOV T4.Y, 0.0,
; EG-NEXT: MOV T4.W, 0.0,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45)
;
; CM-LABEL: global_zextload_v2i16_to_v2i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 7, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: LSHR * T4.Z, T4.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T4.X, T4.X, literal.x,
; CM-NEXT: MOV T4.Y, 0.0,
; CM-NEXT: MOV * T4.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: LSHR * T5.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = zext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
ret void
}
; Two-element vector sign-extending load test: reads a <2 x i16> from %in in
; global memory (addrspace(1)), widens it to <2 x i64> with sext, and stores
; it to %out.
; The expected GCN code fetches both elements with one dword load, moves the
; upper element down with a shift right by 16, sign-extends each 16-bit value
; with v_bfe_i32, and derives each high dword with an arithmetic shift right
; by 31 before a single dwordx4 store.
define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dword v0, off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dword v0, v[0:1]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dword v1, off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v2i16_to_v2i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.XYZW, T5.X, 1
; EG-NEXT: CF_END
; EG-NEXT: PAD
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T4.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: ASHR * T4.W, T4.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: ASHR * T4.Z, T4.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T4.X, T4.X, 0.0, literal.x,
; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 2(2.802597e-45)
; EG-NEXT: ASHR * T4.Y, PV.X, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
;
; CM-LABEL: global_sextload_v2i16_to_v2i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 8, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T4, T5.X
; CM-NEXT: CF_END
; CM-NEXT: PAD
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_32 T4.X, T4.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T4.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: ASHR * T4.W, T4.X, literal.x,
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: ASHR * T4.Z, T4.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT * T4.X, T4.X, 0.0, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T5.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T4.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in
%ext = sext <2 x i16> %load to <2 x i64>
store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
ret void
}
; Four-element vector zero-extending load test: reads a <4 x i16> from %in in
; global memory (addrspace(1)), widens it to <4 x i64> with zext, and stores
; it to %out.
; The expected GCN code fetches all four elements with one dwordx2 load,
; splits each loaded dword into its low half (AND with 0xffff held in an
; SGPR) and high half (logical shift right by 16), fills the high dwords with
; zero, and writes the 32-byte result as two dwordx4 stores, one at the base
; address and one 16 bytes past it.
define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, 0xffff
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v9
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v8
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s2, v8
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s2, v9
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v9
; GCN-HSA-NEXT: v_and_b32_e32 v0, s4, v9
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v8
; GCN-HSA-NEXT: v_and_b32_e32 v4, s4, v8
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[8:9], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v9
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v9
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s6, v8
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v8
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v4i16_to_v4i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 18, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T6.XYZW, T7.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PV.X,
; EG-NEXT: MOV * T0.Z, PS,
; EG-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T5.X, T0.Z, literal.x,
; EG-NEXT: MOV T5.Y, 0.0,
; EG-NEXT: LSHR T6.Z, T0.Y, literal.y,
; EG-NEXT: AND_INT * T6.X, T0.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T6.Y, 0.0,
; EG-NEXT: MOV T5.W, 0.0,
; EG-NEXT: MOV * T6.W, 0.0,
; EG-NEXT: LSHR T7.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR * T8.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v4i16_to_v4i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T6, T7.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV * T3.X, T5.Y,
; CM-NEXT: MOV T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, T2.X,
; CM-NEXT: LSHR * T5.Z, PV.Z, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T5.X, T0.Z, literal.x,
; CM-NEXT: MOV T5.Y, 0.0,
; CM-NEXT: LSHR * T6.Z, T0.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T6.X, T0.Y, literal.x,
; CM-NEXT: MOV T6.Y, 0.0,
; CM-NEXT: MOV * T5.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV * T6.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: LSHR * T7.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T8.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = zext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[6:7], v[1:2], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v3, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx2 v[1:2], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[1:2], 48
; GCN-HSA-NEXT: v_bfe_i32 v2, v4, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v3, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx2 v[1:2], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v4i16_to_v4i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
; EG-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T8.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.XYZW, T6.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 6:
; EG-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T5.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
; EG-NEXT: MOV T2.X, T5.X,
; EG-NEXT: MOV * T3.X, T5.Y,
; EG-NEXT: MOV T0.Y, PS,
; EG-NEXT: MOV * T0.Z, PV.X,
; EG-NEXT: ASHR * T5.W, PV.Z, literal.x,
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T6.X, KC0[2].Y, literal.x,
; EG-NEXT: ASHR T5.Z, T0.Z, literal.y,
; EG-NEXT: ASHR * T7.W, T0.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x,
; EG-NEXT: ASHR * T7.Z, T0.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x,
; EG-NEXT: ASHR T5.Y, PV.X, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: LSHR T8.X, PV.W, literal.x,
; EG-NEXT: ASHR * T7.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; CM-LABEL: global_sextload_v4i16_to_v4i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @6
; CM-NEXT: ALU 20, @9, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T8.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T5, T6.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 6:
; CM-NEXT: VTX_READ_64 T5.XY, T5.X, 0, #1
; CM-NEXT: ALU clause starting at 8:
; CM-NEXT: MOV * T5.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 9:
; CM-NEXT: MOV * T2.X, T5.X,
; CM-NEXT: MOV T3.X, T5.Y,
; CM-NEXT: MOV * T0.Y, PV.X,
; CM-NEXT: MOV * T0.Z, PV.X,
; CM-NEXT: ADD_INT T1.Z, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T5.W, PV.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T6.X, PV.Z, literal.x,
; CM-NEXT: ASHR T5.Z, T0.Z, literal.y,
; CM-NEXT: ASHR * T7.W, T0.Y, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T5.X, T0.Z, 0.0, literal.x,
; CM-NEXT: ASHR * T7.Z, T0.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T7.X, T0.Y, 0.0, literal.x,
; CM-NEXT: ASHR * T5.Y, PV.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T7.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%load = load <4 x i16>, <4 x i16> addrspace(1)* %in
%ext = sext <4 x i16> %load to <4 x i64>
store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
ret void
}
; Zero-extending global load: reads <8 x i16> from %in (addrspace(1)), widens
; every lane to i64 with zext, and writes the <8 x i64> result to %out.
;
; The assertions below are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them.
;
; Expected code shape, as pinned by the assertions:
;  * SI / HSA / VI runs: one 128-bit load (buffer_load_dwordx4 or
;    flat_load_dwordx4); each even lane is isolated by an AND with 0xffff and
;    each odd lane by a logical shift right of 16; the upper dword of every
;    i64 is a plain 0; the result goes out as four dwordx4 stores covering
;    byte offsets 0, 16, 32 and 48.
;  * r600 runs (redwood and cayman): one VTX_READ_128, AND_INT with 65535 and
;    LSHR by 16 per lane, MOV 0.0 for the upper words, four MEM_RAT_CACHELESS
;    stores.
define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v8i16_to_v8i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s7, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s6, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, 0xffff
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s7
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s3
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v9
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v9
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v9
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v9
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s4, s0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s5, s1
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v1
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s12, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s12, v2
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s12, v1
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s12, v3
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[4:7], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v12, 0
; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v12
; GCN-HSA-NEXT: v_mov_b32_e32 v15, v12
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v10, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v3
; GCN-HSA-NEXT: v_and_b32_e32 v11, s4, v3
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14]
; GCN-HSA-NEXT: v_mov_b32_e32 v4, v12
; GCN-HSA-NEXT: v_mov_b32_e32 v13, v12
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v0
; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15]
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, 0xffff
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v17
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v17
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v17
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s6, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s6, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s6, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s6, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v8i16_to_v8i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 30, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T8.XYZW, T14.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T9.XYZW, T13.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T12.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T7.XYZW, T11.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHR * T8.Z, T7.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T8.X, T7.W, literal.x,
; EG-NEXT: MOV T8.Y, 0.0,
; EG-NEXT: LSHR T9.Z, T7.Z, literal.y,
; EG-NEXT: AND_INT * T9.X, T7.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T9.Y, 0.0,
; EG-NEXT: LSHR * T10.Z, T7.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T10.X, T7.Y, literal.x,
; EG-NEXT: MOV T10.Y, 0.0,
; EG-NEXT: LSHR T7.Z, T7.X, literal.y,
; EG-NEXT: AND_INT * T7.X, T7.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T7.Y, 0.0,
; EG-NEXT: MOV T8.W, 0.0,
; EG-NEXT: MOV * T9.W, 0.0,
; EG-NEXT: MOV T10.W, 0.0,
; EG-NEXT: MOV * T7.W, 0.0,
; EG-NEXT: LSHR T11.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T12.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T13.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR * T14.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v8i16_to_v8i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @8
; CM-NEXT: ALU 32, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T8, T14.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T9, T13.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T12.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T11.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: LSHR * T8.Z, T7.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T8.X, T7.X, literal.x,
; CM-NEXT: MOV T8.Y, 0.0,
; CM-NEXT: LSHR * T9.Z, T7.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T9.X, T7.Y, literal.x,
; CM-NEXT: MOV T9.Y, 0.0,
; CM-NEXT: LSHR * T10.Z, T7.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T10.X, T7.Z, literal.x,
; CM-NEXT: MOV T10.Y, 0.0,
; CM-NEXT: LSHR * T7.Z, T7.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T7.X, T7.W, literal.x,
; CM-NEXT: MOV T7.Y, 0.0,
; CM-NEXT: MOV * T8.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV * T9.W, 0.0,
; CM-NEXT: MOV * T10.W, 0.0,
; CM-NEXT: MOV * T7.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T11.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; CM-NEXT: LSHR T12.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR * T13.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T14.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; IR under test: vector load, lane-wise zext i16 -> i64, vector store.
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = zext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
ret void
}
; Sign-extending global load: reads <8 x i16> from %in (addrspace(1)), widens
; every lane to i64 with sext, and writes the <8 x i64> result to %out.
;
; The assertions below are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them.
;
; Expected code shape, as pinned by the assertions:
;  * SI / HSA / VI runs: one 128-bit load; low halves are sign-extended with
;    v_bfe_i32 (offset 0, width 16); high halves are first shifted right by 16
;    and then passed through v_bfe_i32, except that the SI and HSA runs
;    produce two of the high-half lanes directly as 64-bit values with
;    v_ashr_i64 by 48 on a dword pair; the remaining upper dwords come from
;    v_ashrrev_i32 by 31; the result goes out as four dwordx4 stores covering
;    byte offsets 0, 16, 32 and 48.
;  * r600 runs (redwood and cayman): one VTX_READ_128, BFE_INT (width 16) for
;    low halves, ASHR by 16 for high halves, ASHR by 31 for the upper words,
;    four MEM_RAT_CACHELESS stores.
define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v8i16_to_v8i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[10:11], v[0:1], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v12, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[14:15], v[2:3], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v6, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v7, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48
; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v10, v10, 0, 16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v8i16_to_v8i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v2
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v11, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v10, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v8i16_to_v8i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @8
; EG-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T7.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T11.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T9.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T10.XYZW, T8.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 8:
; EG-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; EG-NEXT: ALU clause starting at 10:
; EG-NEXT: MOV * T7.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 11:
; EG-NEXT: LSHR T8.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T9.X, PV.W, literal.x,
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
; EG-NEXT: ASHR * T10.W, T7.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T11.X, PV.W, literal.x,
; EG-NEXT: ASHR T10.Z, T7.X, literal.y,
; EG-NEXT: ASHR * T12.W, T7.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T10.X, T7.X, 0.0, literal.x,
; EG-NEXT: ASHR T12.Z, T7.Y, literal.x,
; EG-NEXT: ASHR * T13.W, T7.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T12.X, T7.Y, 0.0, literal.x,
; EG-NEXT: ASHR T10.Y, PV.X, literal.y,
; EG-NEXT: ASHR T13.Z, T7.Z, literal.x,
; EG-NEXT: ASHR * T14.W, T7.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T13.X, T7.Z, 0.0, literal.x,
; EG-NEXT: ASHR T12.Y, PV.X, literal.y,
; EG-NEXT: ASHR * T14.Z, T7.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T14.X, T7.W, 0.0, literal.x,
; EG-NEXT: ASHR T13.Y, PV.X, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T7.X, PV.W, literal.x,
; EG-NEXT: ASHR * T14.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; CM-LABEL: global_sextload_v8i16_to_v8i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @8
; CM-NEXT: ALU 33, @11, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T7, T14.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T11.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T9.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T10, T8.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 8:
; CM-NEXT: VTX_READ_128 T7.XYZW, T7.X, 0, #1
; CM-NEXT: ALU clause starting at 10:
; CM-NEXT: MOV * T7.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 11:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T8.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; CM-NEXT: LSHR T9.X, PV.W, literal.x,
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
; CM-NEXT: ASHR * T10.W, T7.W, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T11.X, PV.Z, literal.x,
; CM-NEXT: ASHR T10.Z, T7.W, literal.y,
; CM-NEXT: ASHR * T12.W, T7.Z, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T10.X, T7.W, 0.0, literal.x,
; CM-NEXT: ASHR T12.Z, T7.Z, literal.x,
; CM-NEXT: ASHR * T13.W, T7.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T12.X, T7.Z, 0.0, literal.x,
; CM-NEXT: ASHR T10.Y, PV.X, literal.y,
; CM-NEXT: ASHR T13.Z, T7.Y, literal.x,
; CM-NEXT: ASHR * T7.W, T7.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T13.X, T7.Y, 0.0, literal.x,
; CM-NEXT: ASHR T12.Y, PV.X, literal.y,
; CM-NEXT: ASHR * T7.Z, T7.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T7.X, T7.X, 0.0, literal.x,
; CM-NEXT: ASHR * T13.Y, PV.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T14.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T7.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; IR under test: vector load, lane-wise sext i16 -> i64, vector store.
%load = load <8 x i16>, <8 x i16> addrspace(1)* %in
%ext = sext <8 x i16> %load to <8 x i64>
store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, s0, v0
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, s0, v2
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v1
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v3
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v6
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, s0, v6
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v4
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v7
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v7
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v5
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v20
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v20
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(1)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
; GCN-HSA-NEXT: s_mov_b32 s6, 0xffff
; GCN-HSA-NEXT: v_mov_b32_e32 v10, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2
; GCN-HSA-NEXT: s_waitcnt vmcnt(0)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5
; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1
; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3
; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v3
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 64
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7
; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v7
; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6
; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10]
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v0
; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2
; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v9, v14
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3]
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9]
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[11:14]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v28, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v30, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v0, s0, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v1
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v3
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v4
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v5
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v3, s0, v6
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v27, s0, v7
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v29, 16, v7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v4
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v5
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v6, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v1
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v28
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v16i16_to_v16i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @12
; EG-NEXT: ALU 62, @17, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T13.XYZW, T26.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T14.XYZW, T25.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T15.XYZW, T24.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T12.XYZW, T23.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T16.XYZW, T22.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T17.XYZW, T21.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T18.XYZW, T20.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T19.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 17:
; EG-NEXT: LSHR * T13.Z, T12.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T13.X, T12.W, literal.x,
; EG-NEXT: MOV T13.Y, 0.0,
; EG-NEXT: LSHR T14.Z, T12.Z, literal.y,
; EG-NEXT: AND_INT * T14.X, T12.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T14.Y, 0.0,
; EG-NEXT: LSHR * T15.Z, T12.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T15.X, T12.Y, literal.x,
; EG-NEXT: MOV T15.Y, 0.0,
; EG-NEXT: LSHR T12.Z, T12.X, literal.y,
; EG-NEXT: AND_INT * T12.X, T12.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T12.Y, 0.0,
; EG-NEXT: LSHR * T16.Z, T11.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T16.X, T11.W, literal.x,
; EG-NEXT: MOV T16.Y, 0.0,
; EG-NEXT: LSHR T17.Z, T11.Z, literal.y,
; EG-NEXT: AND_INT * T17.X, T11.Z, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T17.Y, 0.0,
; EG-NEXT: LSHR * T18.Z, T11.Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T18.X, T11.Y, literal.x,
; EG-NEXT: MOV T18.Y, 0.0,
; EG-NEXT: LSHR T11.Z, T11.X, literal.y,
; EG-NEXT: AND_INT * T11.X, T11.X, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T11.Y, 0.0,
; EG-NEXT: MOV T13.W, 0.0,
; EG-NEXT: MOV * T14.W, 0.0,
; EG-NEXT: MOV T15.W, 0.0,
; EG-NEXT: MOV * T12.W, 0.0,
; EG-NEXT: MOV T16.W, 0.0,
; EG-NEXT: MOV * T17.W, 0.0,
; EG-NEXT: MOV T18.W, 0.0,
; EG-NEXT: MOV * T11.W, 0.0,
; EG-NEXT: LSHR T19.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T20.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T21.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T22.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T23.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T25.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR * T26.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v16i16_to_v16i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 1 @12
; CM-NEXT: ALU 64, @17, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T13, T26.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T14, T25.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T15, T24.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T23.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T16, T22.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T17, T21.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T18, T20.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T19.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
; CM-NEXT: ALU clause starting at 16:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 17:
; CM-NEXT: LSHR * T13.Z, T12.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T13.X, T12.X, literal.x,
; CM-NEXT: MOV T13.Y, 0.0,
; CM-NEXT: LSHR * T14.Z, T12.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T14.X, T12.Y, literal.x,
; CM-NEXT: MOV T14.Y, 0.0,
; CM-NEXT: LSHR * T15.Z, T12.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T15.X, T12.Z, literal.x,
; CM-NEXT: MOV T15.Y, 0.0,
; CM-NEXT: LSHR * T12.Z, T12.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T12.X, T12.W, literal.x,
; CM-NEXT: MOV T12.Y, 0.0,
; CM-NEXT: LSHR * T16.Z, T11.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T16.X, T11.X, literal.x,
; CM-NEXT: MOV T16.Y, 0.0,
; CM-NEXT: LSHR * T17.Z, T11.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T17.X, T11.Y, literal.x,
; CM-NEXT: MOV T17.Y, 0.0,
; CM-NEXT: LSHR * T18.Z, T11.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T18.X, T11.Z, literal.x,
; CM-NEXT: MOV T18.Y, 0.0,
; CM-NEXT: LSHR * T11.Z, T11.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T11.X, T11.W, literal.x,
; CM-NEXT: MOV T11.Y, 0.0,
; CM-NEXT: MOV * T13.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV * T14.W, 0.0,
; CM-NEXT: MOV * T15.W, 0.0,
; CM-NEXT: MOV * T12.W, 0.0,
; CM-NEXT: MOV * T16.W, 0.0,
; CM-NEXT: MOV * T17.W, 0.0,
; CM-NEXT: MOV * T18.W, 0.0,
; CM-NEXT: MOV * T11.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T19.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; CM-NEXT: LSHR T20.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; CM-NEXT: LSHR T21.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; CM-NEXT: LSHR T23.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; CM-NEXT: LSHR T24.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR * T25.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR * T26.X, KC0[2].Y, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
%ext = zext <16 x i16> %load to <16 x i64>
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
ret void
}
; Kernel under test: load a <16 x i16> vector from global memory (addrspace(1)),
; sign-extend every element to i64, and store the <16 x i64> result to %out.
; The assertion groups inside the body are the expected llc output, one group
; per RUN line in the file header. They are autogenerated (see the NOTE line at
; the top of the file), so regenerate them with the script rather than editing
; them by hand.
define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v16i16_to_v16i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, v7
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v4
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v15, 16, v0
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v8, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[10:11], v[6:7], 48
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[9:10], v[4:5], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v0, 0, 16
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v15, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v8, v12, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[10:11], v[2:3], 48
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[11:12], v[0:1], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v14, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v4, 0, 16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v6
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0
; GCN-HSA-NEXT: s_waitcnt vmcnt(1)
; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[0:1], 48
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_bfe_i32 v8, v1, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[2:3], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 32
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v10, v1, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[0:3]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v6
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, v7
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_bfe_i32 v10, v8, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v8, v6, 0, 16
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v11, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[6:7], 48
; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7]
; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11]
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v16i16_to_v16i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(1)
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v0, 0, 16
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v7
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: s_nop 0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v9, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v1, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v4, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v0, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v16i16_to_v16i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 1 @12
; EG-NEXT: ALU 65, @17, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T12.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T20.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T11.XYZW, T18.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T17.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T16.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T15.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T14.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T13.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 12:
; EG-NEXT: VTX_READ_128 T12.XYZW, T11.X, 16, #1
; EG-NEXT: VTX_READ_128 T11.XYZW, T11.X, 0, #1
; EG-NEXT: ALU clause starting at 16:
; EG-NEXT: MOV * T11.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 17:
; EG-NEXT: LSHR T13.X, KC0[2].Y, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: LSHR T14.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T15.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; EG-NEXT: LSHR T16.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T17.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T18.X, PV.W, literal.x,
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
; EG-NEXT: ASHR * T19.W, T11.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T20.X, PV.W, literal.x,
; EG-NEXT: ASHR T19.Z, T11.X, literal.y,
; EG-NEXT: ASHR * T21.W, T11.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T19.X, T11.X, 0.0, literal.x,
; EG-NEXT: ASHR T21.Z, T11.Y, literal.x,
; EG-NEXT: ASHR * T22.W, T11.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T21.X, T11.Y, 0.0, literal.x,
; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
; EG-NEXT: ASHR T22.Z, T11.Z, literal.x,
; EG-NEXT: ASHR * T23.W, T11.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T22.X, T11.Z, 0.0, literal.x,
; EG-NEXT: ASHR T21.Y, PV.X, literal.y,
; EG-NEXT: ASHR T23.Z, T11.W, literal.x,
; EG-NEXT: ASHR * T24.W, T12.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T23.X, T11.W, 0.0, literal.x,
; EG-NEXT: ASHR T22.Y, PV.X, literal.y,
; EG-NEXT: ASHR T24.Z, T12.X, literal.x,
; EG-NEXT: ASHR * T11.W, T12.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T24.X, T12.X, 0.0, literal.x,
; EG-NEXT: ASHR T23.Y, PV.X, literal.y,
; EG-NEXT: ASHR T11.Z, T12.Y, literal.x,
; EG-NEXT: ASHR * T25.W, T12.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T11.X, T12.Y, 0.0, literal.x,
; EG-NEXT: ASHR T24.Y, PV.X, literal.y,
; EG-NEXT: ASHR T25.Z, T12.Z, literal.x,
; EG-NEXT: ASHR * T26.W, T12.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T25.X, T12.Z, 0.0, literal.x,
; EG-NEXT: ASHR T11.Y, PV.X, literal.y,
; EG-NEXT: ASHR * T26.Z, T12.W, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T26.X, T12.W, 0.0, literal.x,
; EG-NEXT: ASHR T25.Y, PV.X, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T12.X, PV.W, literal.x,
; EG-NEXT: ASHR * T26.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; CM-LABEL: global_sextload_v16i16_to_v16i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @16, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 1 @12
; CM-NEXT: ALU 65, @17, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T12, T26.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T20.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T18.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T17.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T11, T16.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T22, T15.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T14.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T13.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 12:
; CM-NEXT: VTX_READ_128 T12.XYZW, T11.X, 0, #1
; CM-NEXT: VTX_READ_128 T11.XYZW, T11.X, 16, #1
; CM-NEXT: ALU clause starting at 16:
; CM-NEXT: MOV * T11.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 17:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 112(1.569454e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T13.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; CM-NEXT: LSHR T14.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; CM-NEXT: LSHR T15.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; CM-NEXT: LSHR T16.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; CM-NEXT: LSHR T17.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; CM-NEXT: LSHR T18.X, PV.W, literal.x,
; CM-NEXT: ADD_INT T0.Z, KC0[2].Y, literal.y,
; CM-NEXT: ASHR * T19.W, T11.W, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: LSHR T20.X, PV.Z, literal.x,
; CM-NEXT: ASHR T19.Z, T11.W, literal.y,
; CM-NEXT: ASHR * T21.W, T11.Z, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T19.X, T11.W, 0.0, literal.x,
; CM-NEXT: ASHR T21.Z, T11.Z, literal.x,
; CM-NEXT: ASHR * T22.W, T11.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T21.X, T11.Z, 0.0, literal.x,
; CM-NEXT: ASHR T19.Y, PV.X, literal.y,
; CM-NEXT: ASHR T22.Z, T11.Y, literal.x,
; CM-NEXT: ASHR * T11.W, T11.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T22.X, T11.Y, 0.0, literal.x,
; CM-NEXT: ASHR T21.Y, PV.X, literal.y,
; CM-NEXT: ASHR T11.Z, T11.X, literal.x,
; CM-NEXT: ASHR * T23.W, T12.W, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T11.X, T11.X, 0.0, literal.x,
; CM-NEXT: ASHR T22.Y, PV.X, literal.y,
; CM-NEXT: ASHR T23.Z, T12.W, literal.x,
; CM-NEXT: ASHR * T24.W, T12.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T23.X, T12.W, 0.0, literal.x,
; CM-NEXT: ASHR T11.Y, PV.X, literal.y,
; CM-NEXT: ASHR T24.Z, T12.Z, literal.x,
; CM-NEXT: ASHR * T25.W, T12.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T24.X, T12.Z, 0.0, literal.x,
; CM-NEXT: ASHR T23.Y, PV.X, literal.y,
; CM-NEXT: ASHR T25.Z, T12.Y, literal.x,
; CM-NEXT: ASHR * T12.W, T12.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T25.X, T12.Y, 0.0, literal.x,
; CM-NEXT: ASHR T24.Y, PV.X, literal.y,
; CM-NEXT: ASHR * T12.Z, T12.X, literal.x,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T12.X, T12.X, 0.0, literal.x,
; CM-NEXT: ASHR * T25.Y, PV.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T26.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR * T12.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; Read all sixteen i16 elements from the global (addrspace(1)) input pointer.
%load = load <16 x i16>, <16 x i16> addrspace(1)* %in
; Widen each element to i64 with sign extension.
%ext = sext <16 x i16> %load to <16 x i64>
; Write the sixteen i64 elements (128 bytes) to the global output pointer.
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_zextload_v32i16_to_v32i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s14, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s15, 0xe8f000
; GCN-NOHSA-SI-NEXT: s_add_u32 s12, s12, s3
; GCN-NOHSA-SI-NEXT: s_addc_u32 s13, s13, 0
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, 0
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[2:5], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[6:9], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[10:13], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[14:17], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v23, 16, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v4
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v2
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dword v18, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(2)
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, s0, v4
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v19, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v20, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v21, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v22, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, s0, v3
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v32, 16, v5
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v30, s0, v5
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v36, 16, v6
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v34, s0, v6
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v28, 16, v8
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v26, s0, v8
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v40, 16, v7
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v38, s0, v7
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v44, 16, v9
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v42, s0, v9
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v48, 16, v10
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v46, s0, v10
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v24, 16, v12
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v22, s0, v12
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v52, 16, v11
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v50, s0, v11
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v56, 16, v13
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v54, s0, v13
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v17
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v60, 16, v14
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v58, s0, v14
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v20, 16, v16
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v18, s0, v16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v15
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, s0, v15
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, s0, v17
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v55, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v57, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v51, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v53, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v43, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v45, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v39, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v41, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v33, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v6, v23
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v1
; GCN-NOHSA-SI-NEXT: buffer_store_dword v4, off, s[12:15], 0 offset:36 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:40 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:44 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: buffer_store_dword v7, off, s[12:15], 0 offset:48 ; 4-byte Folded Spill
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v59, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v47, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v1
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v35, v1
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v5, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v6, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v7, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v1
; GCN-NOHSA-SI-NEXT: buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v61, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, 0
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v0, v12
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v13
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v2, v14
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v37, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, 0
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v49, 0
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(5)
; GCN-NOHSA-SI-NEXT: buffer_load_dword v8, off, s[12:15], 0 offset:36 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v9, off, s[12:15], 0 offset:40 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v10, off, s[12:15], 0 offset:44 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: buffer_load_dword v11, off, s[12:15], 0 offset:48 ; 4-byte Folded Reload
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[22:25], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[26:29], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: s_mov_b32 s16, 0xffff
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4
; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[6:7]
; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[10:11]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: s_add_u32 s2, s2, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCN-HSA-NEXT: flat_load_dwordx4 v[14:17], v[14:15]
; GCN-HSA-NEXT: flat_load_dwordx4 v[18:21], v[18:19]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: s_add_u32 s10, s0, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x90
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14
; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50
; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v9
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v9
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v7
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v7
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s11
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v13
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v13
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s10
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s13
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v11
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v11
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4
; GCN-HSA-NEXT: s_waitcnt vmcnt(4)
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v19
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v17
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v17
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GCN-HSA-NEXT: s_add_u32 s4, s0, 32
; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v15
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v15
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8
; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5]
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0
; GCN-HSA-NEXT: v_mov_b32_e32 v7, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v18
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v18
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[2:5]
; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v16
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v16
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s6
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v14
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v14
; GCN-HSA-NEXT: v_mov_b32_e32 v14, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s6
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[2:5]
; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v12
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[2:5]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v10
; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v10
; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6
; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5]
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v21
; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4
; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v6
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v20
; GCN-HSA-NEXT: v_and_b32_e32 v4, s16, v20
; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7]
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v8
; GCN-HSA-NEXT: v_and_b32_e32 v12, s16, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1
; GCN-HSA-NEXT: v_mov_b32_e32 v15, v7
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15]
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v7
; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, 0xffff
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v57, 0
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v11, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v15, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v27, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v23, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v19, 0
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v8, s0, v1
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v48, s0, v36
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v50, 16, v36
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v54, s0, v38
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v56, 16, v38
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v36, s0, v37
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v38, 16, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v37, 0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v1, s0, v3
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v12, s0, v0
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v16, s0, v2
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v20, s0, v5
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v24, s0, v4
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v26, 16, v4
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v4, s0, v6
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v28, s0, v7
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v39, s0, v32
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v41, 16, v32
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v32, s0, v34
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v42, s0, v31
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v45, s0, v33
; GCN-NOHSA-VI-NEXT: v_and_b32_e32 v51, s0, v35
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v55, v37
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v53, 16, v35
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v52, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v54, 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:192
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v49, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v51, 0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v47, 16, v33
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v46, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v48, 0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v30, 16, v7
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v44, 16, v31
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v6
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v45, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v43, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v7, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v5, v37
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v18, 16, v2
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v0
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v34, 16, v34
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v35, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v33, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v42, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v40, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v31, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v29, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v2, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v13, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v9, v37
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:176
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v39, 0
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v25, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v21, v37
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v17, v37
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[36:39], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_zextload_v32i16_to_v32i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @22
; EG-NEXT: ALU 33, @31, KC0[], KC1[]
; EG-NEXT: TEX 0 @28
; EG-NEXT: ALU 93, @65, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T23.XYZW, T50.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T24.XYZW, T49.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T25.XYZW, T48.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T47.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T26.XYZW, T46.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T27.XYZW, T45.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T28.XYZW, T44.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T22.XYZW, T43.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T42.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T30.XYZW, T41.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T31.XYZW, T40.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T21.XYZW, T39.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T32.XYZW, T38.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T33.XYZW, T37.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T34.XYZW, T36.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T29.XYZW, T35.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 48, #1
; EG-NEXT: VTX_READ_128 T21.XYZW, T19.X, 16, #1
; EG-NEXT: VTX_READ_128 T22.XYZW, T19.X, 32, #1
; EG-NEXT: Fetch clause starting at 28:
; EG-NEXT: VTX_READ_128 T29.XYZW, T19.X, 0, #1
; EG-NEXT: ALU clause starting at 30:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 31:
; EG-NEXT: LSHR * T23.Z, T20.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T23.X, T20.Z, literal.x,
; EG-NEXT: MOV T23.Y, 0.0,
; EG-NEXT: LSHR T24.Z, T20.W, literal.y,
; EG-NEXT: AND_INT * T24.X, T20.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T24.Y, 0.0,
; EG-NEXT: LSHR * T25.Z, T20.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T25.X, T20.X, literal.x,
; EG-NEXT: MOV T25.Y, 0.0,
; EG-NEXT: LSHR T20.Z, T20.Y, literal.y,
; EG-NEXT: AND_INT * T20.X, T20.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T20.Y, 0.0,
; EG-NEXT: LSHR * T26.Z, T22.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T26.X, T22.Z, literal.x,
; EG-NEXT: MOV T26.Y, 0.0,
; EG-NEXT: LSHR T27.Z, T22.W, literal.y,
; EG-NEXT: AND_INT * T27.X, T22.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T27.Y, 0.0,
; EG-NEXT: LSHR * T28.Z, T22.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T28.X, T22.X, literal.x,
; EG-NEXT: MOV T28.Y, 0.0,
; EG-NEXT: LSHR T22.Z, T22.Y, literal.y,
; EG-NEXT: AND_INT * T22.X, T22.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T22.Y, 0.0,
; EG-NEXT: LSHR * T19.Z, T21.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: ALU clause starting at 65:
; EG-NEXT: AND_INT T19.X, T21.Z, literal.x,
; EG-NEXT: MOV T19.Y, 0.0,
; EG-NEXT: LSHR T30.Z, T21.W, literal.y,
; EG-NEXT: AND_INT * T30.X, T21.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T30.Y, 0.0,
; EG-NEXT: LSHR * T31.Z, T21.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T31.X, T21.X, literal.x,
; EG-NEXT: MOV T31.Y, 0.0,
; EG-NEXT: LSHR T21.Z, T21.Y, literal.y,
; EG-NEXT: AND_INT * T21.X, T21.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T21.Y, 0.0,
; EG-NEXT: LSHR * T32.Z, T29.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T32.X, T29.Z, literal.x,
; EG-NEXT: MOV T32.Y, 0.0,
; EG-NEXT: LSHR T33.Z, T29.W, literal.y,
; EG-NEXT: AND_INT * T33.X, T29.W, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T33.Y, 0.0,
; EG-NEXT: LSHR * T34.Z, T29.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T34.X, T29.X, literal.x,
; EG-NEXT: MOV T34.Y, 0.0,
; EG-NEXT: LSHR T29.Z, T29.Y, literal.y,
; EG-NEXT: AND_INT * T29.X, T29.Y, literal.x,
; EG-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; EG-NEXT: MOV T29.Y, 0.0,
; EG-NEXT: MOV T23.W, 0.0,
; EG-NEXT: MOV * T24.W, 0.0,
; EG-NEXT: MOV T25.W, 0.0,
; EG-NEXT: MOV * T20.W, 0.0,
; EG-NEXT: MOV T26.W, 0.0,
; EG-NEXT: MOV * T27.W, 0.0,
; EG-NEXT: MOV T28.W, 0.0,
; EG-NEXT: MOV * T22.W, 0.0,
; EG-NEXT: MOV T19.W, 0.0,
; EG-NEXT: MOV * T30.W, 0.0,
; EG-NEXT: MOV T31.W, 0.0,
; EG-NEXT: MOV * T21.W, 0.0,
; EG-NEXT: MOV T32.W, 0.0,
; EG-NEXT: MOV * T33.W, 0.0,
; EG-NEXT: MOV T34.W, 0.0,
; EG-NEXT: MOV * T29.W, 0.0,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T35.X, PV.W, literal.x,
; EG-NEXT: LSHR * T36.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T37.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T38.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T39.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T40.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T41.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T42.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T43.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; EG-NEXT: LSHR T44.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T45.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; EG-NEXT: LSHR T46.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; EG-NEXT: LSHR T47.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; EG-NEXT: LSHR T48.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: LSHR T49.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 224(3.138909e-43)
; EG-NEXT: LSHR * T50.X, PV.W, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
;
; CM-LABEL: global_zextload_v32i16_to_v32i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 2 @22
; CM-NEXT: ALU 33, @31, KC0[], KC1[]
; CM-NEXT: TEX 0 @28
; CM-NEXT: ALU 94, @65, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T23, T50.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T24, T49.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T25, T48.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T26, T47.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T46.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T27, T45.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T28, T44.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T29, T43.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T42.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T30, T41.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T31, T40.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T32, T39.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T21, T38.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T33, T37.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T34, T36.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T22.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
; CM-NEXT: VTX_READ_128 T21.XYZW, T19.X, 32, #1
; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 16, #1
; CM-NEXT: Fetch clause starting at 28:
; CM-NEXT: VTX_READ_128 T22.XYZW, T19.X, 48, #1
; CM-NEXT: ALU clause starting at 30:
; CM-NEXT: MOV * T19.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 31:
; CM-NEXT: LSHR * T23.Z, T20.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: AND_INT T23.X, T20.Y, literal.x,
; CM-NEXT: MOV T23.Y, 0.0,
; CM-NEXT: LSHR * T24.Z, T20.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T24.X, T20.X, literal.x,
; CM-NEXT: MOV T24.Y, 0.0,
; CM-NEXT: LSHR * T25.Z, T20.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T25.X, T20.W, literal.x,
; CM-NEXT: MOV T25.Y, 0.0,
; CM-NEXT: LSHR * T26.Z, T20.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T26.X, T20.Z, literal.x,
; CM-NEXT: MOV T26.Y, 0.0,
; CM-NEXT: LSHR * T20.Z, T22.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T20.X, T22.Y, literal.x,
; CM-NEXT: MOV T20.Y, 0.0,
; CM-NEXT: LSHR * T27.Z, T22.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T27.X, T22.X, literal.x,
; CM-NEXT: MOV T27.Y, 0.0,
; CM-NEXT: LSHR * T28.Z, T22.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T28.X, T22.W, literal.x,
; CM-NEXT: MOV T28.Y, 0.0,
; CM-NEXT: LSHR * T29.Z, T22.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T29.X, T22.Z, literal.x,
; CM-NEXT: MOV T29.Y, 0.0,
; CM-NEXT: LSHR * T19.Z, T21.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: ALU clause starting at 65:
; CM-NEXT: AND_INT T19.X, T21.Y, literal.x,
; CM-NEXT: MOV T19.Y, 0.0,
; CM-NEXT: LSHR * T30.Z, T21.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T30.X, T21.X, literal.x,
; CM-NEXT: MOV T30.Y, 0.0,
; CM-NEXT: LSHR * T31.Z, T21.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T31.X, T21.W, literal.x,
; CM-NEXT: MOV T31.Y, 0.0,
; CM-NEXT: LSHR * T32.Z, T21.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T32.X, T21.Z, literal.x,
; CM-NEXT: MOV T32.Y, 0.0,
; CM-NEXT: LSHR * T21.Z, T22.Y, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T21.X, T22.Y, literal.x,
; CM-NEXT: MOV T21.Y, 0.0,
; CM-NEXT: LSHR * T33.Z, T22.X, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T33.X, T22.X, literal.x,
; CM-NEXT: MOV T33.Y, 0.0,
; CM-NEXT: LSHR * T34.Z, T22.W, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T34.X, T22.W, literal.x,
; CM-NEXT: MOV T34.Y, 0.0,
; CM-NEXT: LSHR * T35.Z, T22.Z, literal.y,
; CM-NEXT: 65535(9.183409e-41), 16(2.242078e-44)
; CM-NEXT: AND_INT T35.X, T22.Z, literal.x,
; CM-NEXT: MOV T35.Y, 0.0,
; CM-NEXT: MOV * T23.W, 0.0,
; CM-NEXT: 65535(9.183409e-41), 0(0.000000e+00)
; CM-NEXT: MOV * T24.W, 0.0,
; CM-NEXT: MOV * T25.W, 0.0,
; CM-NEXT: MOV * T26.W, 0.0,
; CM-NEXT: MOV * T20.W, 0.0,
; CM-NEXT: MOV * T27.W, 0.0,
; CM-NEXT: MOV * T28.W, 0.0,
; CM-NEXT: MOV * T29.W, 0.0,
; CM-NEXT: MOV * T19.W, 0.0,
; CM-NEXT: MOV * T30.W, 0.0,
; CM-NEXT: MOV * T31.W, 0.0,
; CM-NEXT: MOV * T32.W, 0.0,
; CM-NEXT: MOV * T21.W, 0.0,
; CM-NEXT: MOV * T33.W, 0.0,
; CM-NEXT: MOV * T34.W, 0.0,
; CM-NEXT: MOV * T35.W, 0.0,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; CM-NEXT: LSHR T36.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; CM-NEXT: LSHR T37.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; CM-NEXT: LSHR T38.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; CM-NEXT: LSHR T39.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; CM-NEXT: LSHR T40.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; CM-NEXT: LSHR T41.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; CM-NEXT: LSHR T42.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; CM-NEXT: LSHR T43.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; CM-NEXT: LSHR T44.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; CM-NEXT: LSHR T45.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; CM-NEXT: LSHR T46.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; CM-NEXT: LSHR T47.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; CM-NEXT: LSHR * T48.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; CM-NEXT: LSHR T49.X, KC0[2].Y, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: LSHR * T50.X, PV.W, literal.x,
; CM-NEXT: 2(2.802597e-45), 0(0.000000e+00)
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = zext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
; GCN-NOHSA-SI-LABEL: global_sextload_v32i16_to_v32i64:
; GCN-NOHSA-SI: ; %bb.0:
; GCN-NOHSA-SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GCN-NOHSA-SI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-SI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-SI-NEXT: s_mov_b32 s10, s2
; GCN-NOHSA-SI-NEXT: s_mov_b32 s11, s3
; GCN-NOHSA-SI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-SI-NEXT: s_mov_b32 s8, s6
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:48
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:32
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[8:11], 0
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:16
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(3)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v3
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v16, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[2:3], 48
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:240
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[0:1], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:208
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v7
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[6:7], 48
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:176
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[4:5], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:144
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(4)
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v15
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v16, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[18:19], v[14:15], 48
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:112
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[12:13], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v13, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:80
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v11
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[10:11], 48
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_ashr_i64 v[17:18], v[8:9], 48
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v9, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v2, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:224
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v12
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:192
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v7, 16, v10
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v6, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:160
; GCN-NOHSA-SI-NEXT: s_waitcnt expcnt(0)
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v0, 16, v8
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v2, v0, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v0, v8, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v5, v10, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v7, v7, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v11, v9, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v9, v12, 0, 16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v14
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v13, v14, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v15, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v17, v4, 0, 16
; GCN-NOHSA-SI-NEXT: v_bfe_i32 v19, v1, 0, 16
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v10, 31, v9
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v18, 31, v17
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v8, 31, v7
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v16, 31, v15
; GCN-NOHSA-SI-NEXT: v_ashrrev_i32_e32 v20, 31, v19
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:128
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:96
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[0:3], 0 offset:64
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[0:3], 0 offset:32
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
; GCN-NOHSA-SI-NEXT: s_endpgm
;
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 48
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
; GCN-HSA-NEXT: s_add_u32 s4, s2, 32
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
; GCN-HSA-NEXT: s_add_u32 s2, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4
; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9]
; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 48
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s4
; GCN-HSA-NEXT: s_waitcnt vmcnt(3)
; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[0:1], 48
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xb0
; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0
; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x90
; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0
; GCN-HSA-NEXT: s_add_u32 s10, s0, 0x70
; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0
; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50
; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3
; GCN-HSA-NEXT: v_bfe_i32 v16, v1, 0, 16
; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0
; GCN-HSA-NEXT: v_ashr_i64 v[18:19], v[2:3], 48
; GCN-HSA-NEXT: s_add_u32 s14, s0, 32
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19]
; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0
; GCN-HSA-NEXT: v_bfe_i32 v18, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15
; GCN-HSA-NEXT: flat_store_dwordx4 v[1:2], v[16:19]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1
; GCN-HSA-NEXT: v_bfe_i32 v2, v1, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5
; GCN-HSA-NEXT: s_waitcnt vmcnt(6)
; GCN-HSA-NEXT: v_bfe_i32 v0, v5, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[4:5], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v7
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[6:7], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9
; GCN-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-HSA-NEXT: v_bfe_i32 v0, v9, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[8:9], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v11
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[10:11], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13
; GCN-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-HSA-NEXT: v_bfe_i32 v0, v13, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[12:13], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3]
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, v15
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11
; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[14:15], 48
; GCN-HSA-NEXT: v_mov_b32_e32 v18, s10
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[0:3]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v4
; GCN-HSA-NEXT: v_bfe_i32 v0, v6, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v6, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[0:3]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v10
; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v14
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v12
; GCN-HSA-NEXT: v_bfe_i32 v0, v12, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v14, v15, 0, 16
; GCN-HSA-NEXT: v_bfe_i32 v12, v10, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v8
; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15]
; GCN-HSA-NEXT: v_bfe_i32 v10, v11, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3
; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2
; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60
; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-HSA-NEXT: s_add_u32 s0, s0, 64
; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16
; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7]
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-HSA-NEXT: s_endpgm
;
; GCN-NOHSA-VI-LABEL: global_sextload_v32i16_to_v32i64:
; GCN-NOHSA-VI: ; %bb.0:
; GCN-NOHSA-VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GCN-NOHSA-VI-NEXT: s_mov_b32 s3, 0xf000
; GCN-NOHSA-VI-NEXT: s_mov_b32 s2, -1
; GCN-NOHSA-VI-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NOHSA-VI-NEXT: s_mov_b32 s0, s4
; GCN-NOHSA-VI-NEXT: s_mov_b32 s1, s5
; GCN-NOHSA-VI-NEXT: s_mov_b32 s4, s6
; GCN-NOHSA-VI-NEXT: s_mov_b32 s5, s7
; GCN-NOHSA-VI-NEXT: s_mov_b32 s6, s2
; GCN-NOHSA-VI-NEXT: s_mov_b32 s7, s3
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
; GCN-NOHSA-VI-NEXT: s_waitcnt vmcnt(0)
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v16, 16, v14
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v16, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v14, v15
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v15, 16, v15
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:224
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v14, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v15, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:240
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v18, v5, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v12
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v14, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v12, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:192
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v13, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v14, 16, v13
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v14, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:208
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v16, v11
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v12, 16, v10
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v17, 16, v11
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v10, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v16, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v12, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v17, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:160
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:176
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v10, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v8
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v11, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v8, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v8, 16, v9
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:128
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v8, 0, 16
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v8, v7
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v7, 16, v7
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:144
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v9, 16, v6
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v11, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v8, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v7, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v13, v9, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:112
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v12, 31, v11
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v6, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v14, 31, v13
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:96
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:64
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v11, 16, v0
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v6, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v2
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v16, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; GCN-NOHSA-VI-NEXT: v_mov_b32_e32 v4, v3
; GCN-NOHSA-VI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v20, v0, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v12, v1, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v8, v11, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v14, v2, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v1, v4, 0, 16
; GCN-NOHSA-VI-NEXT: v_bfe_i32 v3, v3, 0, 16
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v19, 31, v18
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v21, 31, v20
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v11, 31, v10
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v13, 31, v12
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v7, 31, v6
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v9, 31, v8
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v15, 31, v14
; GCN-NOHSA-VI-NEXT: v_ashrrev_i32_e32 v17, 31, v16
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[18:21], off, s[0:3], 0 offset:80
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[14:17], off, s[0:3], 0 offset:32
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0 offset:48
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[6:9], off, s[0:3], 0
; GCN-NOHSA-VI-NEXT: buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:16
; GCN-NOHSA-VI-NEXT: s_endpgm
;
; EG-LABEL: global_sextload_v32i16_to_v32i64:
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @22
; EG-NEXT: ALU 56, @31, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 2 @24
; EG-NEXT: ALU 74, @88, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T50.XYZW, T38.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T49.XYZW, T36.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T48.XYZW, T34.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T39.XYZW, T33.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T47.XYZW, T32.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T46.XYZW, T31.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T45.XYZW, T30.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T40.XYZW, T29.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T44.XYZW, T28.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T43.XYZW, T27.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T42.XYZW, T26.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T25.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T41.XYZW, T24.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T19.XYZW, T23.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T37.XYZW, T22.X, 0
; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T35.XYZW, T21.X, 1
; EG-NEXT: CF_END
; EG-NEXT: Fetch clause starting at 22:
; EG-NEXT: VTX_READ_128 T20.XYZW, T19.X, 0, #1
; EG-NEXT: Fetch clause starting at 24:
; EG-NEXT: VTX_READ_128 T38.XYZW, T19.X, 48, #1
; EG-NEXT: VTX_READ_128 T39.XYZW, T19.X, 32, #1
; EG-NEXT: VTX_READ_128 T40.XYZW, T19.X, 16, #1
; EG-NEXT: ALU clause starting at 30:
; EG-NEXT: MOV * T19.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 31:
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T21.X, PV.W, literal.x,
; EG-NEXT: LSHR * T22.X, KC0[2].Y, literal.x,
; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; EG-NEXT: 48(6.726233e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T23.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; EG-NEXT: LSHR T24.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; EG-NEXT: LSHR T25.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; EG-NEXT: LSHR T26.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; EG-NEXT: LSHR T27.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; EG-NEXT: LSHR T28.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; EG-NEXT: LSHR T29.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; EG-NEXT: LSHR T30.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; EG-NEXT: LSHR T31.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; EG-NEXT: LSHR T32.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; EG-NEXT: LSHR T33.X, PV.W, literal.x,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; EG-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; EG-NEXT: LSHR T34.X, PV.W, literal.x,
; EG-NEXT: ADD_INT T0.W, KC0[2].Y, literal.y,
; EG-NEXT: ASHR * T35.W, T20.Y, literal.z,
; EG-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: LSHR T36.X, PV.W, literal.x,
; EG-NEXT: ASHR T35.Z, T20.Y, literal.y,
; EG-NEXT: ASHR * T37.W, T20.X, literal.z,
; EG-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T35.X, T20.Y, 0.0, literal.x,
; EG-NEXT: ASHR * T37.Z, T20.X, literal.x,
; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT: BFE_INT T37.X, T20.X, 0.0, literal.x,
; EG-NEXT: ASHR T35.Y, PV.X, literal.y,
; EG-NEXT: ASHR * T19.W, T20.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: ALU clause starting at 88:
; EG-NEXT: ASHR T19.Z, T20.W, literal.x,
; EG-NEXT: ASHR * T41.W, T20.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T19.X, T20.W, 0.0, literal.x,
; EG-NEXT: ASHR T37.Y, T37.X, literal.y,
; EG-NEXT: ASHR T41.Z, T20.Z, literal.x,
; EG-NEXT: ASHR * T20.W, T40.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T41.X, T20.Z, 0.0, literal.x,
; EG-NEXT: ASHR T19.Y, PV.X, literal.y,
; EG-NEXT: ASHR T20.Z, T40.Y, literal.x,
; EG-NEXT: ASHR * T42.W, T40.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T20.X, T40.Y, 0.0, literal.x,
; EG-NEXT: ASHR T41.Y, PV.X, literal.y,
; EG-NEXT: ASHR T42.Z, T40.X, literal.x,
; EG-NEXT: ASHR * T43.W, T40.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T42.X, T40.X, 0.0, literal.x,
; EG-NEXT: ASHR T20.Y, PV.X, literal.y,
; EG-NEXT: ASHR T43.Z, T40.W, literal.x,
; EG-NEXT: ASHR * T44.W, T40.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T43.X, T40.W, 0.0, literal.x,
; EG-NEXT: ASHR T42.Y, PV.X, literal.y,
; EG-NEXT: ASHR T44.Z, T40.Z, literal.x,
; EG-NEXT: ASHR * T40.W, T39.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T44.X, T40.Z, 0.0, literal.x,
; EG-NEXT: ASHR T43.Y, PV.X, literal.y,
; EG-NEXT: ASHR T40.Z, T39.Y, literal.x,
; EG-NEXT: ASHR * T45.W, T39.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T40.X, T39.Y, 0.0, literal.x,
; EG-NEXT: ASHR T44.Y, PV.X, literal.y,
; EG-NEXT: ASHR T45.Z, T39.X, literal.x,
; EG-NEXT: ASHR * T46.W, T39.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T45.X, T39.X, 0.0, literal.x,
; EG-NEXT: ASHR T40.Y, PV.X, literal.y,
; EG-NEXT: ASHR T46.Z, T39.W, literal.x,
; EG-NEXT: ASHR * T47.W, T39.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T46.X, T39.W, 0.0, literal.x,
; EG-NEXT: ASHR T45.Y, PV.X, literal.y,
; EG-NEXT: ASHR T47.Z, T39.Z, literal.x,
; EG-NEXT: ASHR * T39.W, T38.Y, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T47.X, T39.Z, 0.0, literal.x,
; EG-NEXT: ASHR T46.Y, PV.X, literal.y,
; EG-NEXT: ASHR T39.Z, T38.Y, literal.x,
; EG-NEXT: ASHR * T48.W, T38.X, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T39.X, T38.Y, 0.0, literal.x,
; EG-NEXT: ASHR T47.Y, PV.X, literal.y,
; EG-NEXT: ASHR T48.Z, T38.X, literal.x,
; EG-NEXT: ASHR * T49.W, T38.W, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T48.X, T38.X, 0.0, literal.x,
; EG-NEXT: ASHR T39.Y, PV.X, literal.y,
; EG-NEXT: ASHR T49.Z, T38.W, literal.x,
; EG-NEXT: ASHR * T50.W, T38.Z, literal.y,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T49.X, T38.W, 0.0, literal.x,
; EG-NEXT: ASHR T48.Y, PV.X, literal.y,
; EG-NEXT: ASHR * T50.Z, T38.Z, literal.x,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: BFE_INT T50.X, T38.Z, 0.0, literal.x,
; EG-NEXT: ASHR T49.Y, PV.X, literal.y,
; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.z,
; EG-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; EG-NEXT: 224(3.138909e-43), 0(0.000000e+00)
; EG-NEXT: LSHR T38.X, PV.W, literal.x,
; EG-NEXT: ASHR * T50.Y, PV.X, literal.y,
; EG-NEXT: 2(2.802597e-45), 31(4.344025e-44)
;
; CM-LABEL: global_sextload_v32i16_to_v32i64:
; CM: ; %bb.0:
; CM-NEXT: ALU 0, @30, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 0 @22
; CM-NEXT: ALU 55, @31, KC0[CB0:0-32], KC1[]
; CM-NEXT: TEX 2 @24
; CM-NEXT: ALU 73, @87, KC0[CB0:0-32], KC1[]
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T38, T50.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T49, T36.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T48, T34.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T47, T33.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T39, T32.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T46, T31.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T45, T30.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T44, T29.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T40, T28.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T43, T27.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T42, T26.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T41, T25.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T20, T24.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T19, T23.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T37, T22.X
; CM-NEXT: MEM_RAT_CACHELESS STORE_DWORD T35, T21.X
; CM-NEXT: CF_END
; CM-NEXT: Fetch clause starting at 22:
; CM-NEXT: VTX_READ_128 T20.XYZW, T19.X, 48, #1
; CM-NEXT: Fetch clause starting at 24:
; CM-NEXT: VTX_READ_128 T38.XYZW, T19.X, 0, #1
; CM-NEXT: VTX_READ_128 T39.XYZW, T19.X, 16, #1
; CM-NEXT: VTX_READ_128 T40.XYZW, T19.X, 32, #1
; CM-NEXT: ALU clause starting at 30:
; CM-NEXT: MOV * T19.X, KC0[2].Z,
; CM-NEXT: ALU clause starting at 31:
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 224(3.138909e-43), 0(0.000000e+00)
; CM-NEXT: LSHR T21.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 240(3.363116e-43)
; CM-NEXT: LSHR T22.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 192(2.690493e-43)
; CM-NEXT: LSHR T23.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 208(2.914701e-43)
; CM-NEXT: LSHR T24.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 160(2.242078e-43)
; CM-NEXT: LSHR T25.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 176(2.466285e-43)
; CM-NEXT: LSHR T26.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 128(1.793662e-43)
; CM-NEXT: LSHR T27.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 144(2.017870e-43)
; CM-NEXT: LSHR T28.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 96(1.345247e-43)
; CM-NEXT: LSHR T29.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 112(1.569454e-43)
; CM-NEXT: LSHR T30.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 64(8.968310e-44)
; CM-NEXT: LSHR T31.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 80(1.121039e-43)
; CM-NEXT: LSHR T32.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 32(4.484155e-44)
; CM-NEXT: LSHR T33.X, PV.W, literal.x,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y,
; CM-NEXT: 2(2.802597e-45), 48(6.726233e-44)
; CM-NEXT: LSHR T34.X, PV.W, literal.x,
; CM-NEXT: ASHR * T35.W, T20.Z, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
; CM-NEXT: LSHR T36.X, KC0[2].Y, literal.x,
; CM-NEXT: ASHR T35.Z, T20.Z, literal.y,
; CM-NEXT: ASHR * T37.W, T20.W, literal.z,
; CM-NEXT: 2(2.802597e-45), 16(2.242078e-44)
; CM-NEXT: 31(4.344025e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T35.X, T20.Z, 0.0, literal.x,
; CM-NEXT: ASHR * T37.Z, T20.W, literal.x,
; CM-NEXT: 16(2.242078e-44), 0(0.000000e+00)
; CM-NEXT: BFE_INT T37.X, T20.W, 0.0, literal.x,
; CM-NEXT: ASHR T35.Y, PV.X, literal.y,
; CM-NEXT: ASHR * T19.W, T20.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: ALU clause starting at 87:
; CM-NEXT: ASHR T19.Z, T20.X, literal.x,
; CM-NEXT: ASHR * T20.W, T20.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T19.X, T20.X, 0.0, literal.x,
; CM-NEXT: ASHR T37.Y, T37.X, literal.y, BS:VEC_120/SCL_212
; CM-NEXT: ASHR T20.Z, T20.Y, literal.x,
; CM-NEXT: ASHR * T41.W, T40.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T20.X, T20.Y, 0.0, literal.x,
; CM-NEXT: ASHR T19.Y, PV.X, literal.y,
; CM-NEXT: ASHR T41.Z, T40.Z, literal.x,
; CM-NEXT: ASHR * T42.W, T40.W, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T41.X, T40.Z, 0.0, literal.x,
; CM-NEXT: ASHR T20.Y, PV.X, literal.y,
; CM-NEXT: ASHR T42.Z, T40.W, literal.x,
; CM-NEXT: ASHR * T43.W, T40.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T42.X, T40.W, 0.0, literal.x,
; CM-NEXT: ASHR T41.Y, PV.X, literal.y,
; CM-NEXT: ASHR T43.Z, T40.X, literal.x,
; CM-NEXT: ASHR * T40.W, T40.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T43.X, T40.X, 0.0, literal.x,
; CM-NEXT: ASHR T42.Y, PV.X, literal.y,
; CM-NEXT: ASHR T40.Z, T40.Y, literal.x,
; CM-NEXT: ASHR * T44.W, T39.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T40.X, T40.Y, 0.0, literal.x,
; CM-NEXT: ASHR T43.Y, PV.X, literal.y,
; CM-NEXT: ASHR T44.Z, T39.Z, literal.x,
; CM-NEXT: ASHR * T45.W, T39.W, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T44.X, T39.Z, 0.0, literal.x,
; CM-NEXT: ASHR T40.Y, PV.X, literal.y,
; CM-NEXT: ASHR T45.Z, T39.W, literal.x,
; CM-NEXT: ASHR * T46.W, T39.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T45.X, T39.W, 0.0, literal.x,
; CM-NEXT: ASHR T44.Y, PV.X, literal.y,
; CM-NEXT: ASHR T46.Z, T39.X, literal.x,
; CM-NEXT: ASHR * T39.W, T39.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T46.X, T39.X, 0.0, literal.x,
; CM-NEXT: ASHR T45.Y, PV.X, literal.y,
; CM-NEXT: ASHR T39.Z, T39.Y, literal.x,
; CM-NEXT: ASHR * T47.W, T38.Z, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T39.X, T39.Y, 0.0, literal.x,
; CM-NEXT: ASHR T46.Y, PV.X, literal.y,
; CM-NEXT: ASHR T47.Z, T38.Z, literal.x,
; CM-NEXT: ASHR * T48.W, T38.W, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T47.X, T38.Z, 0.0, literal.x,
; CM-NEXT: ASHR T39.Y, PV.X, literal.y,
; CM-NEXT: ASHR T48.Z, T38.W, literal.x,
; CM-NEXT: ASHR * T49.W, T38.X, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T48.X, T38.W, 0.0, literal.x,
; CM-NEXT: ASHR T47.Y, PV.X, literal.y,
; CM-NEXT: ASHR T49.Z, T38.X, literal.x,
; CM-NEXT: ASHR * T38.W, T38.Y, literal.y,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T49.X, T38.X, 0.0, literal.x,
; CM-NEXT: ASHR T48.Y, PV.X, literal.y,
; CM-NEXT: ASHR * T38.Z, T38.Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: BFE_INT T38.X, T38.Y, 0.0, literal.x,
; CM-NEXT: ASHR T49.Y, PV.X, literal.y,
; CM-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x,
; CM-NEXT: 16(2.242078e-44), 31(4.344025e-44)
; CM-NEXT: LSHR T50.X, PV.W, literal.x,
; CM-NEXT: ASHR * T38.Y, PV.X, literal.y,
; CM-NEXT: 2(2.802597e-45), 31(4.344025e-44)
%load = load <32 x i16>, <32 x i16> addrspace(1)* %in
%ext = sext <32 x i16> %load to <32 x i64>
store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
ret void
}
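; NOTE(review): the two v64i16 -> v64i64 extload kernels below are commented out,
; so they are neither compiled nor checked by any RUN line. Presumably they were
; disabled for the reason given in the FIXME at the top of this file (the bigger
; testcases spill) -- TODO confirm before re-enabling.
; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {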
; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
; %ext = zext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
; ret void
; }
; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
; %ext = sext <64 x i16> %load to <64 x i64>
; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
; ret void
; }
; Attribute group #0: functions declared with `#0` are marked nounwind.
attributes #0 = { nounwind }