From f6faf01d47068ab2a56248ae4e5c82fa818f6317 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 20 Aug 2020 16:22:59 -0700 Subject: [PATCH] [AMDGPU] Avoid sorting stalls in regbank-reassign This is the slowest operation in the already slow pass. Instead of sorting just put a stall list into an ordered map. Differential Revision: https://reviews.llvm.org/D86253 --- lib/Target/AMDGPU/GCNRegBankReassign.cpp | 72 ++++++---- .../llvm.amdgcn.image.sample.g16.ll | 44 +++--- test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll | 14 +- .../atomic_optimizations_local_pointer.ll | 132 +++++++++--------- .../llvm.amdgcn.image.sample.a16.dim.ll | 68 ++++----- .../llvm.amdgcn.image.sample.g16.encode.ll | 16 +-- .../AMDGPU/llvm.amdgcn.image.sample.g16.ll | 16 +-- test/CodeGen/AMDGPU/regbank-reassign.mir | 2 +- 8 files changed, 189 insertions(+), 175 deletions(-) diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp index d66e26ce01c..1c940428273 100644 --- a/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -84,18 +84,15 @@ class GCNRegBankReassign : public MachineFunctionPass { class Candidate { public: Candidate(MachineInstr *mi, Register reg, unsigned subreg, - unsigned freebanks, unsigned weight) - : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks), - Weight(weight) {} - - bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } + unsigned freebanks) + : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks) {} #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(const GCNRegBankReassign *P) const { MI->dump(); dbgs() << P->printReg(Reg) << " to banks "; dumpFreeBanks(FreeBanks); - dbgs() << " weight " << Weight << '\n'; + dbgs() << '\n'; } #endif @@ -103,16 +100,35 @@ class GCNRegBankReassign : public MachineFunctionPass { Register Reg; unsigned SubReg; unsigned FreeBanks; - unsigned Weight; }; - class CandidateList : public std::list { + class CandidateList : public std::map> { public: - // Speedup subsequent sort. - void push(const Candidate&& C) { - if (C.Weight) push_back(C); - else push_front(C); + void push(unsigned Weight, const Candidate&& C) { + operator[](Weight).push_front(C); } + + Candidate &back() { + return rbegin()->second.back(); + } + + void pop_back() { + rbegin()->second.pop_back(); + if (rbegin()->second.empty()) + erase(rbegin()->first); + } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(const GCNRegBankReassign *P) const { + dbgs() << "\nCandidates:\n\n"; + for (auto &B : *this) { + dbgs() << " Weight " << B.first << ":\n"; + for (auto &C : B.second) + C.dump(P); + } + dbgs() << "\n\n"; + } +#endif }; public: @@ -601,11 +617,11 @@ void GCNRegBankReassign::collectCandidates(MachineInstr& MI, unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); if (FreeBanks1) - Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1, - Weight + ((Size2 > Size1) ? 1 : 0))); + Candidates.push(Weight + ((Size2 > Size1) ? 1 : 0), + Candidate(&MI, Reg1, SubReg1, FreeBanks1)); if (FreeBanks2) - Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2, - Weight + ((Size1 > Size2) ? 1 : 0))); + Candidates.push(Weight + ((Size1 > Size2) ? 1 : 0), + Candidate(&MI, Reg2, SubReg2, FreeBanks2)); } } } @@ -761,9 +777,15 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF, } void GCNRegBankReassign::removeCandidates(Register Reg) { - Candidates.remove_if([Reg, this](const Candidate& C) { - return C.MI->readsRegister(Reg, TRI); - }); + typename CandidateList::iterator Next; + for (auto I = Candidates.begin(), E = Candidates.end(); I != E; I = Next) { + Next = std::next(I); + I->second.remove_if([Reg, this](const Candidate& C) { + return C.MI->readsRegister(Reg, TRI); + }); + if (I->second.empty()) + Candidates.erase(I); + } } bool GCNRegBankReassign::verifyCycles(MachineFunction &MF, @@ -808,11 +830,7 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in " "function " << MF.getName() << '\n'); - Candidates.sort(); - - LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; - for (auto C : Candidates) C.dump(this); - dbgs() << "\n\n"); + LLVM_DEBUG(Candidates.dump(this)); unsigned CyclesSaved = 0; while (!Candidates.empty()) { @@ -827,12 +845,8 @@ bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) { if (LocalCyclesSaved) { removeCandidates(C.Reg); computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); - Candidates.sort(); - LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; - for (auto C : Candidates) - C.dump(this); - dbgs() << "\n\n"); + LLVM_DEBUG(Candidates.dump(this)); } } NumStallsRecovered += CyclesSaved; diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll index caba19977a4..d1c49af8b59 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -21,12 +21,12 @@ define amdgpu_ps <4 x float> @sample_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg ; GFX10-LABEL: sample_d_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3 -; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1 -; GFX10-NEXT: image_sample_d_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1 +; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 +; GFX10-NEXT: image_sample_d_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -77,9 +77,9 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4 -; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2 +; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: image_sample_c_d_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -107,12 +107,12 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-LABEL: sample_d_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -173,12 +173,12 @@ define amdgpu_ps <4 x float> @sample_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-LABEL: sample_cd_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v6, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_or_b32 v3, v2, v6, v3 -; GFX10-NEXT: v_and_or_b32 v10, v0, v6, v1 -; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v10, v3, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v7, v0, v6, v1 +; GFX10-NEXT: v_and_or_b32 v2, v2, v6, v3 +; GFX10-NEXT: image_sample_cd_g16 v[0:3], [v7, v2, v4, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -209,9 +209,9 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_or_b32 v1, v1, v7, v2 -; GFX10-NEXT: v_and_or_b32 v3, v3, v7, v4 -; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v1, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v11, v1, v7, v2 +; GFX10-NEXT: v_and_or_b32 v2, v3, v7, v4 +; GFX10-NEXT: image_sample_c_cd_g16 v[0:3], [v0, v11, v2, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -239,12 +239,12 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10-LABEL: sample_cd_cl_2d: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff -; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3 ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_or_b32 v3, v2, v7, v9 -; GFX10-NEXT: v_and_or_b32 v0, v0, v7, v1 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_or_b32 v11, v0, v7, v1 +; GFX10-NEXT: v_and_or_b32 v1, v2, v7, v9 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v11, v1, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll index b62592eb1c4..6814f5bb184 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2983,19 +2983,19 @@ define <2 x i64> @v_uaddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v10, v4 ; GFX10-NEXT: v_mov_b32_e32 v11, v5 -; GFX10-NEXT: v_mov_b32_e32 v8, v6 -; GFX10-NEXT: v_mov_b32_e32 v9, v7 +; GFX10-NEXT: v_mov_b32_e32 v15, v6 +; GFX10-NEXT: v_mov_b32_e32 v16, v7 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, v10 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v11, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v8 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v3, v9, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v5, vcc_lo, v2, v15 +; GFX10-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, v3, v16, vcc_lo ; GFX10-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[10:11] -; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, v[5:6], v[15:16] ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, -1, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, -1, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v6, -1, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result diff --git a/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index a13320bea7a..c47412b3fe9 100644 --- a/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -511,11 +511,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB2_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -563,11 +563,11 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB2_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -750,11 +750,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -802,11 +802,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -989,11 +989,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -1041,11 +1041,11 @@ define amdgpu_kernel void @add_i32_varying_gfx1064(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -2064,11 +2064,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB10_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -2116,11 +2116,11 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB10_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -2799,11 +2799,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_and_rtn_b32 v0, v0, v7 +; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -2850,11 +2850,11 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_and_rtn_b32 v0, v0, v7 +; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -3037,11 +3037,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_or_rtn_b32 v0, v0, v7 +; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -3089,11 +3089,11 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_or_rtn_b32 v0, v0, v7 +; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -3276,11 +3276,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_xor_rtn_b32 v0, v0, v7 +; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -3328,11 +3328,11 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_xor_rtn_b32 v0, v0, v7 +; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -3512,11 +3512,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_i32 v0, v0, v7 +; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -3563,11 +3563,11 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_i32 v0, v0, v7 +; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -3932,11 +3932,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_i32 v0, v0, v7 +; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -3983,11 +3983,11 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_i32 v0, v0, v7 +; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -4355,11 +4355,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_max_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -4407,11 +4407,11 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_max_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv @@ -4773,11 +4773,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v7, s3 +; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v4, s3 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_min_rtn_u32 v0, v0, v7 +; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv @@ -4824,11 +4824,11 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB23_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v7, s3 +; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v4, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_min_rtn_u32 v0, v0, v7 +; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll index 2a132ce2ccd..6bf89434e0b 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -676,13 +676,13 @@ define amdgpu_ps <4 x float> @sample_c_d_2d(<8 x i32> inreg %rsrc, <4 x i32> inr ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 ; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 -; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -730,13 +730,13 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_d_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_d_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -787,12 +787,12 @@ define amdgpu_ps <4 x float> @sample_c_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3 -; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX10-NEXT: image_sample_c_d_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -888,13 +888,13 @@ define amdgpu_ps <4 x float> @sample_c_cd_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v10, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 ; GFX10-NEXT: v_and_b32_e32 v3, v10, v3 ; GFX10-NEXT: v_and_b32_e32 v1, v10, v1 -; GFX10-NEXT: v_and_b32_e32 v5, v10, v5 +; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 ; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v1, v3, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd v[0:3], [v0, v2, v3, v5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -942,13 +942,13 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_and_b32_e32 v4, v7, v4 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v5, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_cd_cl v[0:3], [v0, v3, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 +; GFX10-NEXT: v_lshl_or_b32 v5, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_cl v[0:3], [v3, v5, v4, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -999,12 +999,12 @@ define amdgpu_ps <4 x float> @sample_c_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10-NEXT: v_mov_b32_e32 v8, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: v_and_b32_e32 v5, v8, v5 -; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_and_b32_e32 v3, v8, v3 +; GFX10-NEXT: v_and_b32_e32 v1, v8, v1 ; GFX10-NEXT: v_lshl_or_b32 v5, v6, 16, v5 -; GFX10-NEXT: v_lshl_or_b32 v1, v2, 16, v1 -; GFX10-NEXT: v_lshl_or_b32 v6, v4, 16, v3 -; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v1, v6, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 +; GFX10-NEXT: v_lshl_or_b32 v3, v4, 16, v3 +; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v1 +; GFX10-NEXT: image_sample_c_cd_cl v[0:3], [v0, v2, v3, v5, v7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1203,13 +1203,13 @@ define amdgpu_ps float @sample_c_d_o_2darray_V1(<8 x i32> inreg %rsrc, <4 x i32> ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v6, v9, v6 ; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 -; GFX10-NEXT: v_and_b32_e32 v6, v9, v6 +; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6 -; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_c_d_o v0, [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -1238,13 +1238,13 @@ define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v9, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v6, v9, v6 ; GFX10-NEXT: v_and_b32_e32 v4, v9, v4 ; GFX10-NEXT: v_and_b32_e32 v2, v9, v2 -; GFX10-NEXT: v_and_b32_e32 v6, v9, v6 +; GFX10-NEXT: v_lshl_or_b32 v6, v7, 16, v6 ; GFX10-NEXT: v_lshl_or_b32 v4, v5, 16, v4 -; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; GFX10-NEXT: v_lshl_or_b32 v7, v7, 16, v6 -; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v2, v4, v7, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 +; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 +; GFX10-NEXT: image_sample_c_d_o v[0:1], [v0, v1, v3, v4, v6, v8], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY a16 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll index 49891aebe70..0ca7ed99181 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0x8c,0xf0,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; encoding: [0xff,0x02,0x0e,0x7e,0xff,0xff,0x00,0x00] ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 ; encoding: [0x07,0x05,0x04,0x36] -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; encoding: [0x00,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 ; encoding: [0x03,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x00,0x00,0x40,0x00,0x03,0x04,0x05,0x06] +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; encoding: [0x07,0x01,0x00,0x36] +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 ; encoding: [0x02,0x00,0x6f,0xd7,0x03,0x21,0x09,0x04] +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 ; encoding: [0x03,0x00,0x6f,0xd7,0x01,0x21,0x01,0x04] +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; encoding: [0x0b,0x0f,0xa4,0xf1,0x03,0x00,0x40,0x00,0x02,0x04,0x05,0x06] ; GFX10-NEXT: s_waitcnt vmcnt(0) ; encoding: [0x70,0x3f,0x8c,0xbf] ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll index e4214ef54cf..7f7d5b376d7 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -93,11 +93,11 @@ define amdgpu_ps <4 x float> @sample_d_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> in ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_d_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: @@ -209,11 +209,11 @@ define amdgpu_ps <4 x float> @sample_cd_cl_2d(<8 x i32> inreg %rsrc, <4 x i32> i ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: v_mov_b32_e32 v7, 0xffff ; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 ; GFX10-NEXT: v_and_b32_e32 v2, v7, v2 -; GFX10-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX10-NEXT: v_lshl_or_b32 v3, v3, 16, v2 -; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v0, v3, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: v_and_b32_e32 v0, v7, v0 +; GFX10-NEXT: v_lshl_or_b32 v2, v3, 16, v2 +; GFX10-NEXT: v_lshl_or_b32 v3, v1, 16, v0 +; GFX10-NEXT: image_sample_cd_cl_g16 v[0:3], [v3, v2, v4, v5, v6], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog main_body: diff --git a/test/CodeGen/AMDGPU/regbank-reassign.mir b/test/CodeGen/AMDGPU/regbank-reassign.mir index 2078d8c2292..6e4838b6085 100644 --- a/test/CodeGen/AMDGPU/regbank-reassign.mir +++ b/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -319,8 +319,8 @@ body: | ... # GCN-LABEL: smem_bundle{{$}} -# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0 # GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0, 0 +# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0 --- name: smem_bundle tracksRegLiveness: true