1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 19:23:23 +01:00

[AMDGPU] Avoid sorting stalls in regbank-reassign

This is the slowest operation in the already slow pass.
Instead of sorting just put a stall list into an ordered
map.

Differential Revision: https://reviews.llvm.org/D86253
This commit is contained in:
Stanislav Mekhanoshin 2020-08-20 16:22:59 -07:00
parent facbe0803e
commit f6faf01d47
8 changed files with 189 additions and 175 deletions

View File

@ -84,18 +84,15 @@ class GCNRegBankReassign : public MachineFunctionPass {
class Candidate {
public:
Candidate(MachineInstr *mi, Register reg, unsigned subreg,
unsigned freebanks, unsigned weight)
: MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks),
Weight(weight) {}
bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
unsigned freebanks)
: MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(const GCNRegBankReassign *P) const {
MI->dump();
dbgs() << P->printReg(Reg) << " to banks ";
dumpFreeBanks(FreeBanks);
dbgs() << " weight " << Weight << '\n';
dbgs() << '\n';
}
#endif
@ -103,16 +100,35 @@ class GCNRegBankReassign : public MachineFunctionPass {
Register Reg;
unsigned SubReg;
unsigned FreeBanks;
unsigned Weight;
};
class CandidateList : public std::list<Candidate> {
class CandidateList : public std::map<unsigned, std::list<Candidate>> {
public:
// Speedup subsequent sort.
void push(const Candidate&& C) {
if (C.Weight) push_back(C);
else push_front(C);
void push(unsigned Weight, const Candidate&& C) {
operator[](Weight).push_front(C);
}
Candidate &back() {
return rbegin()->second.back();
}
void pop_back() {
rbegin()->second.pop_back();
if (rbegin()->second.empty())
erase(rbegin()->first);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(const GCNRegBankReassign *P) const {
dbgs() << "\nCandidates:\n\n";
for (auto &B : *this) {
dbgs() << " Weight " << B.first << ":\n";
for (auto &C : B.second)
C.dump(P);
}
dbgs() << "\n\n";
}
#endif
};
public:
@ -601,11 +617,11 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
if (FreeBanks1)
Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1,
Weight + ((Size2 > Size1) ? 1 : 0)));
Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
Candidate(&MI, Reg1, SubReg1, FreeBanks1));
if (FreeBanks2)
Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2,
Weight + ((Size1 > Size2) ? 1 : 0)));
Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
Candidate(&MI, Reg2, SubReg2, FreeBanks2));
}
}
}
@ -761,9 +777,15 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
}
void GCNRegBankReassign::removeCandidates(Register Reg) {
Candidates.remove_if([Reg, this](const Candidate& C) {
return C.MI->readsRegister(Reg, TRI);
});
typename CandidateList::iterator Next;
for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
Next = std::next(I);
I->second.remove_if([Reg, this](const Candidate& C) {
return C.MI->readsRegister(Reg, TRI);
});
if (I->second.empty())
Candidates.erase(I);
}
}
bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
@ -808,11 +830,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
"function " << MF.getName() << '\n');
Candidates.sort();
LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
for (auto C : Candidates) C.dump(this);
dbgs() << "\n\n");
LLVM_DEBUG(Candidates.dump(this));
unsigned CyclesSaved = 0;
while (!Candidates.empty()) {
@ -827,12 +845,8 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
if (LocalCyclesSaved) {
removeCandidates(C.Reg);
computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
Candidates.sort();
LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
for (auto C : Candidates)
C.dump(this);
dbgs() << "\n\n");
LLVM_DEBUG(Candidates.dump(this));
}
}
NumStallsRecovered += CyclesSaved;

View File

@ -21,12 +21,12 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
; GFX10-LABEL: sample_d_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3
; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -77,9 +77,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -107,12 +107,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-LABEL: sample_d_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -173,12 +173,12 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
; GFX10-LABEL: sample_cd_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3
; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -209,9 +209,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -239,12 +239,12 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10-LABEL: sample_cd_cl_2d:
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -2983,19 +2983,19 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_mov_b32_e32 v10, v4
; GFX10-NEXT: v_mov_b32_e32 v11, v5
; GFX10-NEXT: v_mov_b32_e32 v8, v6
; GFX10-NEXT: v_mov_b32_e32 v9, v7
; GFX10-NEXT: v_mov_b32_e32 v15, v6
; GFX10-NEXT: v_mov_b32_e32 v16, v7
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v10
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v8
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v9, vcc_lo
; GFX10-NEXT: v_add_co_u32_e64 v5, vcc_lo, v2, v15
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11]
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[8:9]
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16]
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, -1, s4
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, -1, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
%result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
ret <2 x i64> %result

View File

@ -511,11 +511,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB2_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -563,11 +563,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB2_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -750,11 +750,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB3_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -802,11 +802,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB3_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -989,11 +989,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB4_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -1041,11 +1041,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB4_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -2064,11 +2064,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB10_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -2116,11 +2116,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB10_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -2799,11 +2799,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB14_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7
; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -2850,11 +2850,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB14_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7
; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -3037,11 +3037,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB15_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7
; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -3089,11 +3089,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB15_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7
; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -3276,11 +3276,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB16_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -3328,11 +3328,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB16_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -3512,11 +3512,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB17_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7
; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -3563,11 +3563,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB17_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7
; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -3932,11 +3932,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB19_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7
; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -3983,11 +3983,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB19_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7
; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -4355,11 +4355,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB21_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7
; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -4407,11 +4407,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB21_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7
; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv
@ -4773,11 +4773,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX1064-NEXT: s_cbranch_execz BB23_2
; GFX1064-NEXT: ; %bb.1:
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7
; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1064-NEXT: buffer_gl0_inv
; GFX1064-NEXT: buffer_gl1_inv
@ -4824,11 +4824,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1032-NEXT: s_cbranch_execz BB23_2
; GFX1032-NEXT: ; %bb.1:
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7
; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1032-NEXT: buffer_gl0_inv
; GFX1032-NEXT: buffer_gl1_inv

View File

@ -676,13 +676,13 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5
; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -730,13 +730,13 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -787,12 +787,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3
; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -888,13 +888,13 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5
; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -942,13 +942,13 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -999,12 +999,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3
; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1203,13 +1203,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6
; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -1238,13 +1238,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06]
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:
@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
; GFX10: ; %bb.0: ; %main_body
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: ; return to shader part epilog
main_body:

View File

@ -319,8 +319,8 @@ body: |
...
# GCN-LABEL: smem_bundle{{$}}
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0, 0
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0
---
name: smem_bundle
tracksRegLiveness: true