mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-18 18:42:46 +02:00
AMDGPU: Handle frame index expansion with no free SGPRs pre gfx9
Since an add instruction must produce an unused carry out, this requires additional SGPRs. This can be avoided by keeping the entire offset computation in SGPRs. If one SGPR is still available, this only costs one extra mov. If none are available, the entire computation can be done in place and reversed. This does assume the use is a VGPR operand. This was already assumed, and we currently only select frame indexes to VALU instructions. This should probably be fixed at some point to handle more possible MIR. llvm-svn: 370929
This commit is contained in:
parent
1493567d2b
commit
52b58835e4
@ -6098,7 +6098,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
|
||||
Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
|
||||
// TODO: Users need to deal with this.
|
||||
if (!UnusedCarry.isValid())
|
||||
report_fatal_error("failed to scavenge unused carry-out SGPR");
|
||||
return MachineInstrBuilder();
|
||||
|
||||
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
|
||||
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
|
||||
|
@ -1273,35 +1273,67 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
|
||||
if (Offset == 0) {
|
||||
// XXX - This never happens because of emergency scavenging slot at 0?
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
|
||||
.addImm(Log2_32(ST.getWavefrontSize()))
|
||||
.addImm(ST.getWavefrontSizeLog2())
|
||||
.addReg(DiffReg);
|
||||
} else {
|
||||
Register ScaledReg =
|
||||
RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
|
||||
if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
|
||||
Register ScaledReg =
|
||||
RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
|
||||
|
||||
// FIXME: Assumed VGPR use.
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
|
||||
.addImm(Log2_32(ST.getWavefrontSize()))
|
||||
.addReg(DiffReg, RegState::Kill);
|
||||
BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
|
||||
ScaledReg)
|
||||
.addImm(ST.getWavefrontSizeLog2())
|
||||
.addReg(DiffReg, RegState::Kill);
|
||||
|
||||
// TODO: Fold if use instruction is another add of a constant.
|
||||
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
|
||||
// TODO: Fold if use instruction is another add of a constant.
|
||||
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
|
||||
// FIXME: This can fail
|
||||
MIB.addImm(Offset);
|
||||
MIB.addReg(ScaledReg, RegState::Kill);
|
||||
MIB.addImm(0); // clamp bit
|
||||
} else {
|
||||
Register ConstOffsetReg =
|
||||
RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);
|
||||
|
||||
// FIXME: This can fail
|
||||
TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
|
||||
.addImm(Offset)
|
||||
.addReg(ScaledReg, RegState::Kill)
|
||||
.addImm(0); // clamp bit
|
||||
// This should always be able to use the unused carry out.
|
||||
assert(ConstOffsetReg && "this scavenge should not be able to fail");
|
||||
|
||||
BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
|
||||
.addImm(Offset);
|
||||
MIB.addReg(ConstOffsetReg, RegState::Kill);
|
||||
MIB.addReg(ScaledReg, RegState::Kill);
|
||||
MIB.addImm(0); // clamp bit
|
||||
}
|
||||
} else {
|
||||
Register ConstOffsetReg =
|
||||
RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
|
||||
// We have to produce a carry out, and there isn't a free SGPR
|
||||
// pair for it. We can keep the whole computation on the SALU to
|
||||
// avoid clobbering an additional register at the cost of an extra
|
||||
// mov.
|
||||
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
|
||||
.addImm(Offset);
|
||||
TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
|
||||
.addReg(ConstOffsetReg, RegState::Kill)
|
||||
// We may have 1 free scratch SGPR even though a carry out is
|
||||
// unavailable. Only one additional mov is needed.
|
||||
Register TmpScaledReg =
|
||||
RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
|
||||
Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
|
||||
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
|
||||
.addReg(DiffReg, RegState::Kill)
|
||||
.addImm(ST.getWavefrontSizeLog2());
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
|
||||
.addReg(ScaledReg, RegState::Kill)
|
||||
.addImm(0); // clamp bit
|
||||
.addImm(Offset);
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
|
||||
.addReg(ScaledReg, RegState::Kill);
|
||||
|
||||
// If there were truly no free SGPRs, we need to undo everything.
|
||||
if (!TmpScaledReg.isValid()) {
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
|
||||
.addReg(ScaledReg, RegState::Kill)
|
||||
.addImm(Offset);
|
||||
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
|
||||
.addReg(DiffReg, RegState::Kill)
|
||||
.addImm(ST.getWavefrontSizeLog2());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
93
test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
Normal file
93
test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
Normal file
@ -0,0 +1,93 @@
|
||||
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
|
||||
|
||||
# Test what happens when an SGPR is unavailable for the unused add
|
||||
# carry out when materializing the frame index.
|
||||
|
||||
|
||||
# There are truly no free SGPRs, so the entire frame index expansion
|
||||
# needs to be inverted to restore the original frame register.
|
||||
|
||||
---
|
||||
name: scavenge_sgpr_pei_no_sgprs
|
||||
tracksRegLiveness: true
|
||||
|
||||
stack:
|
||||
- { id: 0, type: default, offset: 0, size: 4, alignment: 8192 }
|
||||
- { id: 1, type: default, offset: 0, size: 4, alignment: 8192 }
|
||||
|
||||
machineFunctionInfo:
|
||||
isEntryFunction: false
|
||||
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
scratchWaveOffsetReg: $sgpr34
|
||||
frameOffsetReg: $sgpr33
|
||||
stackPtrOffsetReg: $sgpr32
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr1
|
||||
|
||||
; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
|
||||
; CHECK: liveins: $vgpr1
|
||||
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
|
||||
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
|
||||
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
|
||||
; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
|
||||
; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
|
||||
; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
|
||||
; CHECK: $sgpr33 = S_LSHR_B32 killed $sgpr33, 6, implicit-def $scc
|
||||
; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc
|
||||
; CHECK: $vgpr2 = COPY killed $sgpr33
|
||||
; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc
|
||||
; CHECK: $sgpr33 = S_LSHL_B32 killed $sgpr33, 6, implicit-def $scc
|
||||
; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
|
||||
; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
|
||||
; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
|
||||
; CHECK: $sgpr33 = frame-setup COPY $sgpr27
|
||||
; CHECK: S_ENDPGM 0, implicit $vcc
|
||||
S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
|
||||
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
|
||||
S_ENDPGM 0, implicit $vcc
|
||||
...
|
||||
|
||||
# One 32-bit SGPR is available for the intermediate scale computation,
|
||||
# so only an extra copy to VALU is necessary.
|
||||
|
||||
---
|
||||
name: scavenge_sgpr_pei_one_sgpr
|
||||
tracksRegLiveness: true
|
||||
|
||||
stack:
|
||||
- { id: 0, type: default, offset: 0, size: 4, alignment: 8192 }
|
||||
- { id: 1, type: default, offset: 0, size: 4, alignment: 8192 }
|
||||
|
||||
machineFunctionInfo:
|
||||
isEntryFunction: false
|
||||
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
|
||||
scratchWaveOffsetReg: $sgpr34
|
||||
frameOffsetReg: $sgpr33
|
||||
stackPtrOffsetReg: $sgpr32
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr1
|
||||
|
||||
; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr
|
||||
; CHECK: liveins: $vgpr1
|
||||
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
|
||||
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
|
||||
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
|
||||
; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
|
||||
; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
|
||||
; CHECK: $sgpr29 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
|
||||
; CHECK: $sgpr29 = S_LSHR_B32 killed $sgpr29, 6, implicit-def $scc
|
||||
; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc
|
||||
; CHECK: $vgpr2 = COPY killed $sgpr29
|
||||
; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
|
||||
; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
|
||||
; CHECK: $sgpr33 = frame-setup COPY $sgpr27
|
||||
; CHECK: S_ENDPGM 0, implicit $vcc
|
||||
S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
|
||||
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
|
||||
S_ENDPGM 0, implicit $vcc
|
||||
...
|
Loading…
Reference in New Issue
Block a user