mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
AMDGPU: Handle s_buffer_load_dword hazard on SI
Reviewers: arsenm, nhaehnle
Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye
Differential Revision: https://reviews.llvm.org/D39171
llvm-svn: 316666
This commit is contained in:
parent
6a9d0663f6
commit
a99a74391b
@ -335,6 +335,18 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
|
||||
// SGPR was written by a VALU instruction.
|
||||
int SmrdSgprWaitStates = 4;
|
||||
auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
|
||||
auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
|
||||
|
||||
bool IsBufferSMRD = SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX16_IMM ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORD_SGPR ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR ||
|
||||
SMRD->getOpcode() == AMDGPU::S_BUFFER_LOAD_DWORDX16_SGPR;
|
||||
|
||||
for (const MachineOperand &Use : SMRD->uses()) {
|
||||
if (!Use.isReg())
|
||||
@ -342,7 +354,22 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
|
||||
int WaitStatesNeededForUse =
|
||||
SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn);
|
||||
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
|
||||
|
||||
// This fixes what appears to be undocumented hardware behavior in SI where
|
||||
// s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
|
||||
// needs some number of nops in between. We don't know how many we need, but
|
||||
// let's use 4. This wasn't discovered before probably because the only
|
||||
// case when this happens is when we expand a 64-bit pointer into a full
|
||||
// descriptor and use s_buffer_load_dword instead of s_load_dword, which was
|
||||
// probably never encountered in the closed-source land.
|
||||
if (IsBufferSMRD) {
|
||||
int WaitStatesNeededForUse =
|
||||
SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
|
||||
IsBufferHazardDefFn);
|
||||
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
|
||||
}
|
||||
}
|
||||
|
||||
return WaitStatesNeeded;
|
||||
}
|
||||
|
||||
|
@ -84,6 +84,23 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}smrd_hazard:
|
||||
; GCN-DAG: s_mov_b32 s3, 3
|
||||
; GCN-DAG: s_mov_b32 s2, 2
|
||||
; GCN-DAG: s_mov_b32 s1, 1
|
||||
; GCN-DAG: s_mov_b32 s0, 0
|
||||
; SI-NEXT: nop 3
|
||||
; GCN-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
|
||||
; Regression input for the SMRD hazard check: the descriptor handed to the
; load is built entirely from constants, and the CHECK lines above expect four
; s_mov_b32 writes to s0..s3, then (SI only) "nop 3", then the
; s_buffer_load_dword that reads s[0:3].
define amdgpu_ps float @smrd_hazard(<4 x i32> inreg %desc) #0 {
|
||||
main_body:
|
||||
; Build the vector <0, 1, 2, 3> one lane at a time. The incoming %desc
; argument is not referenced anywhere in the body.
%d0 = insertelement <4 x i32> undef, i32 0, i32 0
|
||||
%d1 = insertelement <4 x i32> %d0, i32 1, i32 1
|
||||
%d2 = insertelement <4 x i32> %d1, i32 2, i32 2
|
||||
%d3 = insertelement <4 x i32> %d2, i32 3, i32 3
|
||||
; Load through the constant-built vector with i32 0 as the second operand
; (the expected s_buffer_load_dword above shows it as 0x0).
%r = call float @llvm.SI.load.const.v4i32(<4 x i32> %d3, i32 0)
|
||||
ret float %r
|
||||
}
|
||||
|
||||
; SMRD load using the load.const.v4i32 intrinsic with an immediate offset
|
||||
; GCN-LABEL: {{^}}smrd_load_const0:
|
||||
; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
|
||||
|
Loading…
x
Reference in New Issue
Block a user