
AMDGPU: Handle frame index expansion with no free SGPRs pre gfx9

Since an add instruction must produce an unused carry out, this
requires additional SGPRs. This can be avoided by keeping the entire
offset computation in SGPRs. If one SGPR is still available, this only
costs one extra mov. If none are available, the entire computation can
be done in place and reversed.

This does assume the use is a VGPR operand. This was already assumed,
and we currently only select frame indexes to VALU instructions. This
should probably be fixed at some point to handle more possible MIR.

llvm-svn: 370929
Matt Arsenault 2019-09-04 17:12:57 +00:00
parent 1493567d2b
commit 52b58835e4
3 changed files with 147 additions and 22 deletions
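
To make the fallback concrete, the following is a small standalone model (plain C++) of the no-free-SGPR path described in the message above: the scaled frame-index value is computed in place in the frame register with scalar ops, copied into the VGPR use, and the same steps are then applied in reverse to restore the frame register. This is an illustrative sketch only; the register roles, the wave64 shift amount of 6, and the concrete offset values are assumptions, not the in-tree implementation.

// Model of the in-place SALU expansion and its inverse (illustrative values).
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t WaveLog2 = 6;        // wave64 scratch swizzle (assumed)
  const uint32_t WaveOffset = 0x4000; // scratch wave offset value (assumed)
  const uint32_t Offset = 8192;       // frame index offset being materialized
  uint32_t FP = WaveOffset + (1024u << WaveLog2); // wavefront-aligned frame register

  const uint32_t SavedFP = FP;
  FP -= WaveOffset;         // S_SUB_U32
  FP >>= WaveLog2;          // S_LSHR_B32
  FP += Offset;             // S_ADD_U32
  uint32_t VGPRValue = FP;  // COPY into the VGPR use of the frame index
  (void)VGPRValue;

  // Undo everything so the frame register keeps its original value. The
  // right shift dropped no bits because FP - WaveOffset is a multiple of the
  // wavefront size, so the reversal is exact.
  FP -= Offset;             // S_SUB_U32
  FP <<= WaveLog2;          // S_LSHL_B32
  FP += WaveOffset;         // S_ADD_U32
  assert(FP == SavedFP);
  return 0;
}

When one scratch SGPR is still free, the same shift-and-add sequence can target that register instead, so only the final copy into a VGPR is extra and no undo steps are needed; the two MIR tests added below check exactly these two cases.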

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

@@ -6098,7 +6098,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
   Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
   // TODO: Users need to deal with this.
   if (!UnusedCarry.isValid())
-    report_fatal_error("failed to scavenge unused carry-out SGPR");
+    return MachineInstrBuilder();

   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
     .addReg(UnusedCarry, RegState::Define | RegState::Dead);

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

@@ -1273,35 +1273,67 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
       if (Offset == 0) {
         // XXX - This never happens because of emergency scavenging slot at 0?
         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
-          .addImm(Log2_32(ST.getWavefrontSize()))
+          .addImm(ST.getWavefrontSizeLog2())
           .addReg(DiffReg);
       } else {
-        Register ScaledReg =
-          RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+        if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
+          Register ScaledReg =
+            RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);

-        BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
-          .addImm(Log2_32(ST.getWavefrontSize()))
-          .addReg(DiffReg, RegState::Kill);
+          // FIXME: Assumed VGPR use.
+          BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+                  ScaledReg)
+            .addImm(ST.getWavefrontSizeLog2())
+            .addReg(DiffReg, RegState::Kill);

-        // TODO: Fold if use instruction is another add of a constant.
-        if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
-          // FIXME: This can fail
-          TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
-            .addImm(Offset)
-            .addReg(ScaledReg, RegState::Kill)
-            .addImm(0); // clamp bit
-        } else {
-          Register ConstOffsetReg =
-            RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+          // TODO: Fold if use instruction is another add of a constant.
+          if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+            MIB.addImm(Offset);
+            MIB.addReg(ScaledReg, RegState::Kill);
+            MIB.addImm(0); // clamp bit
+          } else {
+            Register ConstOffsetReg =
+              RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);

-          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
-            .addImm(Offset);
-          // FIXME: This can fail
-          TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)
-            .addReg(ConstOffsetReg, RegState::Kill)
-            .addReg(ScaledReg, RegState::Kill)
-            .addImm(0); // clamp bit
-        }
+            // This should always be able to use the unused carry out.
+            assert(ConstOffsetReg && "this scavenge should not be able to fail");
+
+            BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+              .addImm(Offset);
+            MIB.addReg(ConstOffsetReg, RegState::Kill);
+            MIB.addReg(ScaledReg, RegState::Kill);
+            MIB.addImm(0); // clamp bit
+          }
+        } else {
+          // We have to produce a carry out, and there isn't a free SGPR pair
+          // for it. We can keep the whole computation on the SALU to avoid
+          // clobbering an additional register at the cost of an extra mov.
+
+          // We may have 1 free scratch SGPR even though a carry out is
+          // unavailable. Only one additional mov is needed.
+          Register TmpScaledReg =
+            RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+          Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+            .addReg(DiffReg, RegState::Kill)
+            .addImm(ST.getWavefrontSizeLog2());
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
+            .addReg(ScaledReg, RegState::Kill)
+            .addImm(Offset);
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+            .addReg(ScaledReg, RegState::Kill);
+
+          // If there were truly no free SGPRs, we need to undo everything.
+          if (!TmpScaledReg.isValid()) {
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
+              .addReg(ScaledReg, RegState::Kill)
+              .addImm(Offset);
+            BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
+              .addReg(DiffReg, RegState::Kill)
+              .addImm(ST.getWavefrontSizeLog2());
+          }
+        }
       }

New MIR test (llvm/test/CodeGen/AMDGPU)

@@ -0,0 +1,93 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s
# Test what happens when an SGPR is unavailable for the unused add
# carry out when materializing the frame index.
# There are truly no free SGPRs, so the entire frame index expansion
# needs to be inverted to restore the original frame register.
---
name: scavenge_sgpr_pei_no_sgprs
tracksRegLiveness: true
stack:
  - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 }
  - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 }
machineFunctionInfo:
  isEntryFunction: false
  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
  scratchWaveOffsetReg: $sgpr34
  frameOffsetReg:  $sgpr33
  stackPtrOffsetReg: $sgpr32
body: |
bb.0:
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
; CHECK: liveins: $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
; CHECK: $sgpr33 = S_LSHR_B32 killed $sgpr33, 6, implicit-def $scc
; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc
; CHECK: $vgpr2 = COPY killed $sgpr33
; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc
; CHECK: $sgpr33 = S_LSHL_B32 killed $sgpr33, 6, implicit-def $scc
; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc
; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
; CHECK: $sgpr33 = frame-setup COPY $sgpr27
; CHECK: S_ENDPGM 0, implicit $vcc
S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31
S_ENDPGM 0, implicit $vcc
...
# One 32-bit SGPR is available for the intermediate scale computation,
# so only an extra copy to VALU is necessary.
---
name: scavenge_sgpr_pei_one_sgpr
tracksRegLiveness: true
stack:
  - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 }
  - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 }
machineFunctionInfo:
  isEntryFunction: false
  scratchRSrcReg:  $sgpr0_sgpr1_sgpr2_sgpr3
  scratchWaveOffsetReg: $sgpr34
  frameOffsetReg:  $sgpr33
  stackPtrOffsetReg: $sgpr32
body: |
bb.0:
liveins: $vgpr1
; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr
; CHECK: liveins: $vgpr1
; CHECK: $sgpr27 = frame-setup COPY $sgpr33
; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc
; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
; CHECK: $sgpr29 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc
; CHECK: $sgpr29 = S_LSHR_B32 killed $sgpr29, 6, implicit-def $scc
; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc
; CHECK: $vgpr2 = COPY killed $sgpr29
; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc
; CHECK: $sgpr33 = frame-setup COPY $sgpr27
; CHECK: S_ENDPGM 0, implicit $vcc
S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc
$vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31
S_ENDPGM 0, implicit $vcc
...