mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
[AMDGPU] Avoid sorting stalls in regbank-reassign
This is the slowest operation in the already slow pass. Instead of sorting, just put the stall list into an ordered map keyed by weight. Differential Revision: https://reviews.llvm.org/D86253
This commit is contained in:
parent
facbe0803e
commit
f6faf01d47
@ -84,18 +84,15 @@ class GCNRegBankReassign : public MachineFunctionPass {
|
||||
class Candidate {
|
||||
public:
|
||||
Candidate(MachineInstr *mi, Register reg, unsigned subreg,
|
||||
unsigned freebanks, unsigned weight)
|
||||
: MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks),
|
||||
Weight(weight) {}
|
||||
|
||||
bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
|
||||
unsigned freebanks)
|
||||
: MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {}
|
||||
|
||||
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
||||
void dump(const GCNRegBankReassign *P) const {
|
||||
MI->dump();
|
||||
dbgs() << P->printReg(Reg) << " to banks ";
|
||||
dumpFreeBanks(FreeBanks);
|
||||
dbgs() << " weight " << Weight << '\n';
|
||||
dbgs() << '\n';
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -103,16 +100,35 @@ class GCNRegBankReassign : public MachineFunctionPass {
|
||||
Register Reg;
|
||||
unsigned SubReg;
|
||||
unsigned FreeBanks;
|
||||
unsigned Weight;
|
||||
};
|
||||
|
||||
class CandidateList : public std::list<Candidate> {
|
||||
class CandidateList : public std::map<unsigned, std::list<Candidate>> {
|
||||
public:
|
||||
// Speedup subsequent sort.
|
||||
void push(const Candidate&& C) {
|
||||
if (C.Weight) push_back(C);
|
||||
else push_front(C);
|
||||
void push(unsigned Weight, const Candidate&& C) {
|
||||
operator[](Weight).push_front(C);
|
||||
}
|
||||
|
||||
Candidate &back() {
|
||||
return rbegin()->second.back();
|
||||
}
|
||||
|
||||
void pop_back() {
|
||||
rbegin()->second.pop_back();
|
||||
if (rbegin()->second.empty())
|
||||
erase(rbegin()->first);
|
||||
}
|
||||
|
||||
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
|
||||
void dump(const GCNRegBankReassign *P) const {
|
||||
dbgs() << "\nCandidates:\n\n";
|
||||
for (auto &B : *this) {
|
||||
dbgs() << " Weight " << B.first << ":\n";
|
||||
for (auto &C : B.second)
|
||||
C.dump(P);
|
||||
}
|
||||
dbgs() << "\n\n";
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
public:
|
||||
@ -601,11 +617,11 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
|
||||
unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks);
|
||||
unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks);
|
||||
if (FreeBanks1)
|
||||
Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1,
|
||||
Weight + ((Size2 > Size1) ? 1 : 0)));
|
||||
Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0),
|
||||
Candidate(&MI, Reg1, SubReg1, FreeBanks1));
|
||||
if (FreeBanks2)
|
||||
Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2,
|
||||
Weight + ((Size1 > Size2) ? 1 : 0)));
|
||||
Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0),
|
||||
Candidate(&MI, Reg2, SubReg2, FreeBanks2));
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -761,9 +777,15 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
|
||||
}
|
||||
|
||||
void GCNRegBankReassign::removeCandidates(Register Reg) {
|
||||
Candidates.remove_if([Reg, this](const Candidate& C) {
|
||||
return C.MI->readsRegister(Reg, TRI);
|
||||
});
|
||||
typename CandidateList::iterator Next;
|
||||
for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) {
|
||||
Next = std::next(I);
|
||||
I->second.remove_if([Reg, this](const Candidate& C) {
|
||||
return C.MI->readsRegister(Reg, TRI);
|
||||
});
|
||||
if (I->second.empty())
|
||||
Candidates.erase(I);
|
||||
}
|
||||
}
|
||||
|
||||
bool GCNRegBankReassign::verifyCycles(MachineFunction &MF,
|
||||
@ -808,11 +830,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
|
||||
LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
|
||||
"function " << MF.getName() << '\n');
|
||||
|
||||
Candidates.sort();
|
||||
|
||||
LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
|
||||
for (auto C : Candidates) C.dump(this);
|
||||
dbgs() << "\n\n");
|
||||
LLVM_DEBUG(Candidates.dump(this));
|
||||
|
||||
unsigned CyclesSaved = 0;
|
||||
while (!Candidates.empty()) {
|
||||
@ -827,12 +845,8 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
|
||||
if (LocalCyclesSaved) {
|
||||
removeCandidates(C.Reg);
|
||||
computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true);
|
||||
Candidates.sort();
|
||||
|
||||
LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
|
||||
for (auto C : Candidates)
|
||||
C.dump(this);
|
||||
dbgs() << "\n\n");
|
||||
LLVM_DEBUG(Candidates.dump(this));
|
||||
}
|
||||
}
|
||||
NumStallsRecovered += CyclesSaved;
|
||||
|
@ -21,12 +21,12 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg
|
||||
; GFX10-LABEL: sample_d_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3
|
||||
; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1
|
||||
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
|
||||
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
|
||||
; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -77,9 +77,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
|
||||
; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4
|
||||
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
|
||||
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
|
||||
; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -107,12 +107,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10-LABEL: sample_d_cl_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9
|
||||
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
|
||||
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
|
||||
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
|
||||
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -173,12 +173,12 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre
|
||||
; GFX10-LABEL: sample_cd_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3
|
||||
; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1
|
||||
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1
|
||||
; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3
|
||||
; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -209,9 +209,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2
|
||||
; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4
|
||||
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2
|
||||
; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4
|
||||
; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -239,12 +239,12 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10-LABEL: sample_cd_cl_2d:
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9
|
||||
; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1
|
||||
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1
|
||||
; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9
|
||||
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
|
@ -2983,19 +2983,19 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v11, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, v6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, v7
|
||||
; GFX10-NEXT: v_mov_b32_e32 v15, v6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v16, v7
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v10
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v8
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v9, vcc_lo
|
||||
; GFX10-NEXT: v_add_co_u32_e64 v5, vcc_lo, v2, v15
|
||||
; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[8:9]
|
||||
; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16]
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, -1, s4
|
||||
; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, -1, s4
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
|
||||
ret <2 x i64> %result
|
||||
|
@ -511,11 +511,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB2_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -563,11 +563,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB2_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -750,11 +750,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB3_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -802,11 +802,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB3_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -989,11 +989,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB4_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -1041,11 +1041,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB4_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -2064,11 +2064,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB10_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -2116,11 +2116,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB10_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -2799,11 +2799,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB14_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -2850,11 +2850,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB14_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -3037,11 +3037,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB15_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -3089,11 +3089,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB15_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -3276,11 +3276,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB16_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -3328,11 +3328,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB16_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -3512,11 +3512,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB17_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -3563,11 +3563,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB17_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -3932,11 +3932,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB19_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -3983,11 +3983,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB19_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -4355,11 +4355,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB21_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -4407,11 +4407,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB21_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
@ -4773,11 +4773,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
|
||||
; GFX1064-NEXT: s_cbranch_execz BB23_2
|
||||
; GFX1064-NEXT: ; %bb.1:
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1064-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7
|
||||
; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4
|
||||
; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1064-NEXT: buffer_gl0_inv
|
||||
; GFX1064-NEXT: buffer_gl1_inv
|
||||
@ -4824,11 +4824,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) {
|
||||
; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
|
||||
; GFX1032-NEXT: s_cbranch_execz BB23_2
|
||||
; GFX1032-NEXT: ; %bb.1:
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, s3
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo
|
||||
; GFX1032-NEXT: v_mov_b32_e32 v4, s3
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7
|
||||
; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4
|
||||
; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
||||
; GFX1032-NEXT: buffer_gl0_inv
|
||||
; GFX1032-NEXT: buffer_gl1_inv
|
||||
|
@ -676,13 +676,13 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5
|
||||
; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -730,13 +730,13 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
|
||||
; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -787,12 +787,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3
|
||||
; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -888,13 +888,13 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, v10, v3
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, v10, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v5, v10, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5
|
||||
; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -942,13 +942,13 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, v7, v4
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
|
||||
; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -999,12 +999,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v5, v8, v5
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
|
||||
; GFX10-NEXT: v_and_b32_e32 v3, v8, v3
|
||||
; GFX10-NEXT: v_and_b32_e32 v1, v8, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5
|
||||
; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1
|
||||
; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3
|
||||
; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1
|
||||
; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -1203,13 +1203,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32>
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
|
||||
; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6
|
||||
; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -1238,13 +1238,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
|
||||
; GFX10-NEXT: v_and_b32_e32 v4, v9, v4
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v9, v2
|
||||
; GFX10-NEXT: v_and_b32_e32 v6, v9, v6
|
||||
; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6
|
||||
; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6
|
||||
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
|
@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
|
||||
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06]
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
|
||||
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00]
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
|
||||
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06]
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04]
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04]
|
||||
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf]
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
|
@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
|
||||
; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i
|
||||
; GFX10: ; %bb.0: ; %main_body
|
||||
; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff
|
||||
; GFX10-NEXT: ; implicit-def: $vcc_hi
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_and_b32_e32 v2, v7, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2
|
||||
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: v_and_b32_e32 v0, v7, v0
|
||||
; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2
|
||||
; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0
|
||||
; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: ; return to shader part epilog
|
||||
main_body:
|
||||
|
@ -319,8 +319,8 @@ body: |
|
||||
...
|
||||
|
||||
# GCN-LABEL: smem_bundle{{$}}
|
||||
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0
|
||||
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0, 0
|
||||
# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0
|
||||
---
|
||||
name: smem_bundle
|
||||
tracksRegLiveness: true
|
||||
|
Loading…
Reference in New Issue
Block a user