2012-12-11 22:25:42 +01:00
|
|
|
//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// \brief SI implementation of the TargetRegisterInfo class.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "SIRegisterInfo.h"
|
2013-11-14 00:36:50 +01:00
|
|
|
#include "SIInstrInfo.h"
|
2014-07-21 17:45:01 +02:00
|
|
|
#include "SIMachineFunctionInfo.h"
|
|
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
|
|
#include "llvm/CodeGen/RegisterScavenging.h"
|
2014-08-21 22:40:54 +02:00
|
|
|
#include "llvm/IR/Function.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
2012-12-11 22:25:42 +01:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use less registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults, to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-14 18:27:07 +02:00
|
|
|
static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
|
|
|
|
const SIMachineFunctionInfo& MFI = *MF.getInfo<SIMachineFunctionInfo>();
|
|
|
|
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
|
|
|
|
unsigned SIMDPerCU = 4;
|
|
|
|
|
|
|
|
unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
|
|
|
|
return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
|
|
|
|
MaxInvocationsPerWave;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compute the number of SGPRs a function may use while still allowing the
// required number of waves (derived from the maximum work group size) to be
// resident on each SIMD at once. The per-generation constants below describe
// the SGPR file; see the subtarget ISA documentation for their origin.
static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);

  // Per-generation SGPR file parameters: total physical SGPRs per SIMD, how
  // many of them are addressable by instructions, the allocation granularity,
  // and how many are reserved for special uses.
  unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
  unsigned ReservedSGPRCount;

  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    TotalSGPRCountPerSIMD = 800;
    AddressableSGPRCount = 102;
    SGPRUsageAlignment = 16;
    ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
  } else {
    TotalSGPRCountPerSIMD = 512;
    AddressableSGPRCount = 104;
    SGPRUsageAlignment = 8;
    ReservedSGPRCount = 2; // VCC
  }

  // Divide the SGPR file among the waves that must be resident, rounded down
  // to the allocation granularity.
  unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
  MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);

  // Chips with the SGPR init bug are pinned to a fixed SGPR budget regardless
  // of occupancy.
  if (ST.hasSGPRInitBug())
    MaxSGPRCount = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;

  // Never exceed the addressable range after setting aside the reserved SGPRs.
  return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
}
|
|
|
|
|
|
|
|
static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
|
|
|
|
unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
|
|
|
|
unsigned TotalVGPRCountPerSIMD = 256;
|
|
|
|
unsigned VGPRUsageAlignment = 4;
|
|
|
|
|
|
|
|
return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
|
|
|
|
VGPRUsageAlignment);
|
|
|
|
}
|
|
|
|
|
2016-03-23 02:53:22 +01:00
|
|
|
// Return true if PSetID appears in the -1-terminated pressure-set list PSets.
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (const int *P = PSets; *P != -1; ++P) {
    if (*P == static_cast<int>(PSetID))
      return true;
  }
  return false;
}
|
|
|
|
|
|
|
|
// Record in \p PressureSets whether pressure set \p PSetID applies to any
// register unit of \p Reg. Used by the constructor to build the SGPR/VGPR
// pressure-set membership bit vectors.
void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
                                         BitVector &PressureSets) const {
  for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
    const int *PSets = getRegUnitPressureSets(*U);
    if (hasPressureSet(PSets, PSetID)) {
      // One matching unit is enough; the bit is per pressure set, not per unit.
      PressureSets.set(PSetID);
      break;
    }
  }
}
|
|
|
|
|
|
|
|
SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
                                   SGPRPressureSets(getNumRegPressureSets()),
                                   VGPRPressureSets(getNumRegPressureSets()) {
  unsigned NumRegPressureSets = getNumRegPressureSets();

  // Initialize the set IDs to an out-of-range sentinel; the loop below must
  // find both (checked by the assert at the end).
  SGPR32SetID = NumRegPressureSets;
  VGPR32SetID = NumRegPressureSets;
  for (unsigned i = 0; i < NumRegPressureSets; ++i) {
    // Identify the SGPR_32 / VGPR_32 pressure sets by name prefix.
    if (strncmp("SGPR_32", getRegPressureSetName(i), 7) == 0)
      SGPR32SetID = i;
    else if (strncmp("VGPR_32", getRegPressureSetName(i), 7) == 0)
      VGPR32SetID = i;

    // Classify every pressure set as SGPR- and/or VGPR-related, using a
    // representative register from each file.
    classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
  }
  assert(SGPR32SetID < NumRegPressureSets &&
         VGPR32SetID < NumRegPressureSets);
}
|
2012-12-11 22:25:42 +01:00
|
|
|
|
2015-08-26 20:54:50 +02:00
|
|
|
// Mark \p Reg and every register that aliases it (including Reg itself, and
// thus all super- and sub-registers) as reserved in \p Reserved.
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
  for (MCRegAliasIterator Alias(Reg, this, /*IncludeSelf=*/true);
       Alias.isValid(); ++Alias)
    Reserved.set(*Alias);
}
|
2014-09-24 03:33:23 +02:00
|
|
|
|
2015-11-30 22:16:03 +01:00
|
|
|
// Pick the SReg_128 to reserve for the private segment buffer resource
// descriptor: the highest 4-aligned quadruple of SGPRs below the function's
// usable SGPR count.
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
  const MachineFunction &MF) const {
  // The descriptor needs 4 SGPRs aligned to 4, so align the usable count down
  // and step back one full tuple.
  unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
|
|
|
|
|
|
|
|
// Pick the SGPR to reserve for the scratch wave byte offset, placing it next
// to (or inside the alignment hole left by) the reserved private segment
// buffer chosen in reservedPrivateSegmentBufferReg().
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
  const MachineFunction &MF) const {
  unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
  unsigned Reg;

  // Try to place it in a hole after PrivateSegmentBufferReg.
  if (RegCount & 3) {
    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
    // alignment constraints, so we have a hole where we can put the wave
    // offset.
    Reg = RegCount - 1;
  } else {
    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
    // wave offset before it.
    Reg = RegCount - 5;
  }
  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
|
|
|
|
|
2015-08-26 20:54:50 +02:00
|
|
|
// Build the set of registers the allocator must not use for this function:
// special registers, trap-handler registers, registers above the occupancy
// limits, and the scratch setup registers.
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);

  // EXEC_LO and EXEC_HI could be allocated and used as regular register, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);

  // Occupancy limits derived from the maximum work group size.
  unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
  unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);

  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  // Reserve every SGPR above the usable count.
  for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  // Reserve every VGPR above the usable count.
  for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
    // Reserve 1 SGPR for scratch wave offset in case we need to spill.
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
  }

  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
    // The two scratch registers must not overlap.
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
  }

  // Reserve VGPRs for trap handler usage if "amdgpu-debugger-reserve-trap-regs"
  // attribute was specified.
  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  if (ST.debuggerReserveTrapVGPRs()) {
    // Reserve the top of the usable VGPR range for the debugger trap handler.
    unsigned ReservedVGPRFirst =
      MaxWorkGroupVGPRCount - MFI->getDebuggerReserveTrapVGPRCount();
    for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) {
      unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  }

  return Reserved;
}
|
|
|
|
|
2015-03-11 19:34:58 +01:00
|
|
|
// Return the register-pressure limit for pressure set \p Idx, based on how
// many SGPRs/VGPRs are allowed at the subtarget's maximum waves per CU.
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
  // FIXME: We should adjust the max number of waves based on LDS size.
  unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(),
                                          STI.getMaxWavesPerCU());
  unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());

  // Combined limit for pressure sets spanning both register files.
  unsigned VSLimit = SGPRLimit + VGPRLimit;

  if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
    // FIXME: This is a hack. We should never be considering the pressure of
    // these since no virtual register should ever have this class.
    return VSLimit;
  }

  if (SGPRPressureSets.test(Idx))
    return SGPRLimit;

  return VGPRLimit;
}
|
|
|
|
|
2014-07-21 17:45:01 +02:00
|
|
|
// Scavenging is only needed when the function has stack objects (spills).
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
  return Fn.getFrameInfo()->hasStackObjects();
}
|
|
|
|
|
2016-03-04 19:02:01 +01:00
|
|
|
// Frame-index elimination needs a scavenger only when stack objects exist,
// mirroring requiresRegisterScavenging() above.
bool
SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
  return MF.getFrameInfo()->hasStackObjects();
}
|
|
|
|
|
2016-04-16 04:13:37 +02:00
|
|
|
bool SIRegisterInfo::requiresVirtualBaseRegisters(
  const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}
|
|
|
|
|
|
|
|
// Return the immediate offset encoded in \p MI when its operand \p Idx is a
// frame index; only MUBUF instructions carry such an offset here.
int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI))
    return 0;

  // Frame indices are only expected in the vaddr operand of MUBUF accesses.
  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::vaddr) &&
         "Should never see frame index on non-address operand");

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}
|
|
|
|
|
|
|
|
// Any memory access through a frame index may need a materialized base
// register; non-memory users never do.
bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  return MI->mayLoadOrStore();
}
|
|
|
|
|
|
|
|
// Materialize \p BaseReg = frame index \p FrameIdx + \p Offset at the top of
// \p MBB, so later accesses can use BaseReg instead of the raw frame index.
void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                  unsigned BaseReg,
                                                  int FrameIdx,
                                                  int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  // Borrow the debug location of the first instruction, if any.
  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const AMDGPUSubtarget &Subtarget = MF->getSubtarget<AMDGPUSubtarget>();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  assert(isUInt<27>(Offset) &&
         "Private offset should never exceed maximum private size");

  // With no offset a plain copy of the frame index suffices.
  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
      .addFrameIndex(FrameIdx);
    return;
  }

  MachineRegisterInfo &MRI = MF->getRegInfo();
  // V_ADD_I32_e64 defines a carry-out we don't need; mark it dead.
  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_ADD_I32_e64), BaseReg)
    .addReg(UnusedCarry, RegState::Define | RegState::Dead)
    .addImm(Offset)
    .addFrameIndex(FrameIdx);
}
|
|
|
|
|
|
|
|
// Rewrite the frame-index operand of \p MI to use \p BaseReg plus \p Offset,
// folding the offset into the instruction's immediate when it fits, and
// otherwise emitting an explicit add into a fresh VGPR.
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
                                       int64_t Offset) const {

  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const AMDGPUSubtarget &Subtarget = MF->getSubtarget<AMDGPUSubtarget>();
  const SIInstrInfo *TII
    = static_cast<const SIInstrInfo *>(Subtarget.getInstrInfo());

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  // Sanity check: at most one frame-index operand on the instruction.
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  assert(FIOp && FIOp->isFI() && "frame index must be address operand");

  // Only MUBUF accesses reach here (they carry a vaddr + immediate offset).
  assert(TII->isMUBUF(MI));

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;
  if (isUInt<12>(NewOffset)) {
    // If we have a legal offset, fold it directly into the instruction.
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

  // The offset is not legal, so we must insert an add of the offset.
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned NewReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  DebugLoc DL = MI.getDebugLoc();

  assert(Offset != 0 && "Non-zero offset expected");

  // V_ADD_I32_e64 defines a carry-out we don't need; mark it dead.
  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // In the case the instruction already had an immediate offset, here only
  // the requested new offset is added because we are leaving the original
  // immediate in place.
  BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_I32_e64), NewReg)
    .addReg(UnusedCarry, RegState::Define | RegState::Dead)
    .addImm(Offset)
    .addReg(BaseReg);

  FIOp->ChangeToRegister(NewReg, false);
}
|
|
|
|
|
|
|
|
// An offset is directly encodable only in MUBUF instructions, whose
// immediate offset field is 12 bits unsigned.
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        unsigned BaseReg,
                                        int64_t Offset) const {
  return SIInstrInfo::isMUBUF(*MI) && isUInt<12>(Offset);
}
|
|
|
|
|
|
|
|
const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
  const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}
|
|
|
|
|
2014-08-21 22:40:54 +02:00
|
|
|
// Map a spill/restore pseudo opcode to the number of 32-bit sub-registers it
// moves (i.e. the register width in dwords).
static unsigned getNumSubRegsForSpillOp(unsigned Op) {

  switch (Op) {
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}
|
|
|
|
|
2015-01-14 16:42:31 +01:00
|
|
|
// Expand a VGPR spill/restore pseudo at \p MI into one scratch buffer
// load/store per 32-bit sub-register. \p Offset is the byte offset of the
// first dword; when Offset + spill size does not fit the 12-bit MUBUF
// immediate, the offset is folded into an SGPR (scavenged if possible,
// otherwise temporarily added to ScratchOffset and subtracted back at the
// end).
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
                                           unsigned LoadStoreOp,
                                           unsigned Value,
                                           unsigned ScratchRsrcReg,
                                           unsigned ScratchOffset,
                                           int64_t Offset,
                                           RegScavenger *RS) const {

  MachineBasicBlock *MBB = MI->getParent();
  MachineFunction *MF = MI->getParent()->getParent();
  const SIInstrInfo *TII =
    static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();
  bool IsStore = TII->get(LoadStoreOp).mayStore();

  bool RanOutOfSGPRs = false;
  bool Scavenged = false;
  unsigned SOffset = ScratchOffset;
  // Remember the original offset so it can be subtracted back if we had to
  // modify ScratchOffset in place.
  unsigned OriginalImmOffset = Offset;

  unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
  unsigned Size = NumSubRegs * 4;

  if (!isUInt<12>(Offset + Size)) {
    SOffset = AMDGPU::NoRegister;

    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
    if (RS)
      SOffset = RS->FindUnusedReg(&AMDGPU::SGPR_32RegClass);

    if (SOffset == AMDGPU::NoRegister) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset register, and then
      // subtract the offset after the spill to return ScratchOffset to its
      // original value.
      RanOutOfSGPRs = true;
      SOffset = ScratchOffset;
    } else {
      Scavenged = true;
    }
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
            .addReg(ScratchOffset)
            .addImm(Offset);
    // The whole offset now lives in SOffset; the per-dword immediates below
    // start from 0.
    Offset = 0;
  }

  // Emit one buffer access per 32-bit sub-register, 4 bytes apart.
  for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
    unsigned SubReg = NumSubRegs > 1 ?
        getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
        Value;

    // Kill the scavenged offset register on its last use.
    unsigned SOffsetRegState = 0;
    if (i + 1 == e && Scavenged)
      SOffsetRegState |= RegState::Kill;

    BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
      .addReg(SubReg, getDefRegState(!IsStore))
      .addReg(ScratchRsrcReg)
      .addReg(SOffset, SOffsetRegState)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addReg(Value, RegState::Implicit | getDefRegState(!IsStore))
      .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
  }

  if (RanOutOfSGPRs) {
    // Subtract the offset we added to the ScratchOffset register.
    BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffset)
      .addReg(ScratchOffset)
      .addImm(OriginalImmOffset);
  }
}
|
|
|
|
|
2014-07-21 17:45:01 +02:00
|
|
|
/// \brief Rewrite the frame-index operand of \p MI into something the hardware
/// can execute.
///
/// Three families of pseudo instructions are handled:
///  - SGPR spill/restore pseudos: lowered to V_WRITELANE/V_READLANE into a
///    spill VGPR lane when one was assigned, otherwise to a real scratch
///    store/load through a temporary VGPR.
///  - VGPR spill/restore pseudos: lowered via buildScratchLoadStore to
///    BUFFER_STORE/LOAD_DWORD_OFFSET instructions.
///  - Any other instruction: the frame index is folded to its byte offset,
///    materialized in a VGPR if the immediate is not legal for the operand.
///
/// \param MI           the instruction containing the frame index
/// \param SPAdj        stack pointer adjustment (unused here)
/// \param FIOperandNum index of the frame-index operand within \p MI
/// \param RS           register scavenger, forwarded to buildScratchLoadStore
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                        int SPAdj, unsigned FIOperandNum,
                                        RegScavenger *RS) const {
  MachineFunction *MF = MI->getParent()->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineBasicBlock *MBB = MI->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();

  MachineOperand &FIOp = MI->getOperand(FIOperandNum);
  int Index = MI->getOperand(FIOperandNum).getIndex();

  switch (MI->getOpcode()) {
    // SGPR register spill
    case AMDGPU::SI_SPILL_S512_SAVE:
    case AMDGPU::SI_SPILL_S256_SAVE:
    case AMDGPU::SI_SPILL_S128_SAVE:
    case AMDGPU::SI_SPILL_S64_SAVE:
    case AMDGPU::SI_SPILL_S32_SAVE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // Spill each 32-bit sub-register of the SGPR tuple individually.
      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);
        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.hasReg()) {
          // A VGPR lane was reserved for this spill: write the SGPR value
          // into that lane instead of touching memory.
          BuildMI(*MBB, MI, DL,
                  TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
                  Spill.VGPR)
                  .addReg(SubReg)
                  .addImm(Spill.Lane);

          // FIXME: Since this spills to another register instead of an actual
          // frame index, we should delete the frame index when all references to
          // it are fixed.
        } else {
          // Spill SGPR to a frame index.
          // FIXME we should use S_STORE_DWORD here for VI.
          // SGPRs cannot be stored to scratch directly, so bounce the value
          // through a temporary VGPR first.
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                  .addReg(SubReg);

          unsigned Size = FrameInfo->getObjectSize(Index);
          unsigned Align = FrameInfo->getObjectAlignment(Index);
          MachinePointerInfo PtrInfo
              = MachinePointerInfo::getFixedStack(*MF, Index);
          MachineMemOperand *MMO
              = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                         Size, Align);
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
                  .addReg(TmpReg)                         // src
                  .addFrameIndex(Index)                   // frame_idx
                  .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
                  .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
                  .addImm(i * 4)                          // offset
                  .addMemOperand(MMO);
        }
      }
      MI->eraseFromParent();
      break;
    }

    // SGPR register restore
    case AMDGPU::SI_SPILL_S512_RESTORE:
    case AMDGPU::SI_SPILL_S256_RESTORE:
    case AMDGPU::SI_SPILL_S128_RESTORE:
    case AMDGPU::SI_SPILL_S64_RESTORE:
    case AMDGPU::SI_SPILL_S32_RESTORE: {
      unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // Reload each 32-bit sub-register, mirroring the save path above.
      for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
        unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
                                           &AMDGPU::SGPR_32RegClass, i);
        struct SIMachineFunctionInfo::SpilledReg Spill =
            MFI->getSpilledReg(MF, Index, i);

        if (Spill.hasReg()) {
          // Read the value straight back out of the reserved VGPR lane. The
          // implicit def keeps the full SGPR tuple live across the pieces.
          BuildMI(*MBB, MI, DL,
                  TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                  SubReg)
                  .addReg(Spill.VGPR)
                  .addImm(Spill.Lane)
                  .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
        } else {
          // Restore SGPR from a stack slot.
          // FIXME: We should use S_LOAD_DWORD here for VI.

          unsigned Align = FrameInfo->getObjectAlignment(Index);
          unsigned Size = FrameInfo->getObjectSize(Index);

          MachinePointerInfo PtrInfo
              = MachinePointerInfo::getFixedStack(*MF, Index);

          MachineMemOperand *MMO = MF->getMachineMemOperand(
              PtrInfo, MachineMemOperand::MOLoad, Size, Align);

          // Load into a temporary VGPR, then move lane 0 into the SGPR.
          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
                  .addFrameIndex(Index)                   // frame_idx
                  .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
                  .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
                  .addImm(i * 4)                          // offset
                  .addMemOperand(MMO);
          BuildMI(*MBB, MI, DL,
                  TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
                  .addReg(TmpReg, RegState::Kill)
                  .addImm(0)
                  .addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
        }
      }

      MI->eraseFromParent();
      break;
    }

    // VGPR register spill
    case AMDGPU::SI_SPILL_V512_SAVE:
    case AMDGPU::SI_SPILL_V256_SAVE:
    case AMDGPU::SI_SPILL_V128_SAVE:
    case AMDGPU::SI_SPILL_V96_SAVE:
    case AMDGPU::SI_SPILL_V64_SAVE:
    case AMDGPU::SI_SPILL_V32_SAVE:
      // VGPRs go straight to scratch memory; the final offset is the frame
      // object's offset plus the pseudo's own immediate offset operand.
      buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index) +
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
      MI->eraseFromParent();
      break;
    case AMDGPU::SI_SPILL_V32_RESTORE:
    case AMDGPU::SI_SPILL_V64_RESTORE:
    case AMDGPU::SI_SPILL_V96_RESTORE:
    case AMDGPU::SI_SPILL_V128_RESTORE:
    case AMDGPU::SI_SPILL_V256_RESTORE:
    case AMDGPU::SI_SPILL_V512_RESTORE: {
      buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
            TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
            TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
            FrameInfo->getObjectOffset(Index) +
            TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), RS);
      MI->eraseFromParent();
      break;
    }

    default: {
      // Not a spill pseudo: fold the frame index to its byte offset. If the
      // immediate is not encodable in this operand, materialize it in a VGPR
      // and use that register instead (killed at this use).
      int64_t Offset = FrameInfo->getObjectOffset(Index);
      FIOp.ChangeToImmediate(Offset);
      if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
        unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
        BuildMI(*MBB, MI, MI->getDebugLoc(),
                TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
                .addImm(Offset);
        FIOp.ChangeToRegister(TmpReg, false, false, true);
      }
    }
  }
}
|
|
|
|
|
2013-11-14 00:36:50 +01:00
|
|
|
unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
|
2014-03-31 16:01:52 +02:00
|
|
|
return getEncodingValue(Reg) & 0xff;
|
2013-11-14 00:36:50 +01:00
|
|
|
}
|
|
|
|
|
2015-10-01 23:43:15 +02:00
|
|
|
// FIXME: This is very slow. It might be worth creating a map from physreg to
|
|
|
|
// register class.
|
2013-08-07 01:08:18 +02:00
|
|
|
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
  assert(!TargetRegisterInfo::isVirtualRegister(Reg));

  // Candidate classes. The list is ordered so the smallest class containing
  // Reg wins; do not reorder.
  static const TargetRegisterClass *const BaseClasses[] = {
    &AMDGPU::VGPR_32RegClass,
    &AMDGPU::SReg_32RegClass,
    &AMDGPU::VReg_64RegClass,
    &AMDGPU::SReg_64RegClass,
    &AMDGPU::VReg_96RegClass,
    &AMDGPU::VReg_128RegClass,
    &AMDGPU::SReg_128RegClass,
    &AMDGPU::VReg_256RegClass,
    &AMDGPU::SReg_256RegClass,
    &AMDGPU::VReg_512RegClass,
    &AMDGPU::SReg_512RegClass,
    &AMDGPU::SCC_CLASSRegClass,
  };

  // Linear scan; return the first class that contains the register.
  for (const TargetRegisterClass *Candidate : BaseClasses)
    if (Candidate->contains(Reg))
      return Candidate;

  // No base class covers this physical register.
  return nullptr;
}
|
2013-08-15 01:24:24 +02:00
|
|
|
|
2015-09-26 06:59:04 +02:00
|
|
|
// TODO: It might be helpful to have some target specific flags in
|
|
|
|
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
|
2013-11-14 00:36:37 +01:00
|
|
|
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
|
2015-09-26 06:59:04 +02:00
|
|
|
switch (RC->getSize()) {
|
2016-02-13 00:45:29 +01:00
|
|
|
case 0: return false;
|
|
|
|
case 1: return false;
|
2015-09-26 06:59:04 +02:00
|
|
|
case 4:
|
|
|
|
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
|
|
|
|
case 8:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
|
|
|
|
case 12:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
|
|
|
|
case 16:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
|
|
|
|
case 32:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
|
|
|
|
case 64:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid register class size");
|
|
|
|
}
|
2013-11-14 00:36:37 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/// \brief Return the VGPR register class with the same size as \p SRC.
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                         const TargetRegisterClass *SRC) const {
  const unsigned Bytes = SRC->getSize();
  if (Bytes == 4)
    return &AMDGPU::VGPR_32RegClass;
  if (Bytes == 8)
    return &AMDGPU::VReg_64RegClass;
  if (Bytes == 12)
    return &AMDGPU::VReg_96RegClass;
  if (Bytes == 16)
    return &AMDGPU::VReg_128RegClass;
  if (Bytes == 32)
    return &AMDGPU::VReg_256RegClass;
  if (Bytes == 64)
    return &AMDGPU::VReg_512RegClass;
  llvm_unreachable("Invalid register class size");
}
|
|
|
|
|
2016-02-11 22:45:07 +01:00
|
|
|
/// \brief Return the SGPR register class with the same size as \p VRC.
/// Note: there is no 12-byte scalar class, so a 96-bit input is invalid here.
const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
                                         const TargetRegisterClass *VRC) const {
  switch (VRC->getSize()) {
  case 4:  return &AMDGPU::SGPR_32RegClass;
  case 8:  return &AMDGPU::SReg_64RegClass;
  case 16: return &AMDGPU::SReg_128RegClass;
  case 32: return &AMDGPU::SReg_256RegClass;
  case 64: return &AMDGPU::SReg_512RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
|
|
|
|
|
2013-11-14 00:36:37 +01:00
|
|
|
/// \brief Return the register class covering the sub-register \p SubIdx of a
/// register in class \p RC, staying on the same (scalar or vector) side.
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
  const TargetRegisterClass *RC, unsigned SubIdx) const {
  // No sub-register index means the whole register.
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  unsigned NumRegs = countPopulation(getSubRegIndexLaneMask(SubIdx));

  if (isSGPRClass(RC)) {
    switch (NumRegs) {
    case 1: return &AMDGPU::SGPR_32RegClass;
    case 2: return &AMDGPU::SReg_64RegClass;
    case 4: return &AMDGPU::SReg_128RegClass;
    case 8: return &AMDGPU::SReg_256RegClass;
    default:
      // 16 and anything else are invalid sub-register widths.
      llvm_unreachable("Invalid sub-register class size");
    }
  }

  switch (NumRegs) {
  case 1: return &AMDGPU::VGPR_32RegClass;
  case 2: return &AMDGPU::VReg_64RegClass;
  case 3: return &AMDGPU::VReg_96RegClass;
  case 4: return &AMDGPU::VReg_128RegClass;
  case 8: return &AMDGPU::VReg_256RegClass;
  default:
    // 16 and anything else are invalid sub-register widths.
    llvm_unreachable("Invalid sub-register class size");
  }
}
|
2014-05-02 17:41:42 +02:00
|
|
|
|
2015-09-24 10:36:14 +02:00
|
|
|
bool SIRegisterInfo::shouldRewriteCopySrc(
|
|
|
|
const TargetRegisterClass *DefRC,
|
|
|
|
unsigned DefSubReg,
|
|
|
|
const TargetRegisterClass *SrcRC,
|
|
|
|
unsigned SrcSubReg) const {
|
|
|
|
// We want to prefer the smallest register class possible, so we don't want to
|
|
|
|
// stop and rewrite on anything that looks like a subregister
|
|
|
|
// extract. Operations mostly don't care about the super register class, so we
|
|
|
|
// only want to stop on the most basic of copies between the smae register
|
|
|
|
// class.
|
|
|
|
//
|
|
|
|
// e.g. if we have something like
|
|
|
|
// vreg0 = ...
|
|
|
|
// vreg1 = ...
|
|
|
|
// vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
|
|
|
|
// vreg3 = COPY vreg2, sub0
|
|
|
|
//
|
|
|
|
// We want to look through the COPY to find:
|
|
|
|
// => vreg3 = COPY vreg0
|
|
|
|
|
|
|
|
// Plain copy.
|
|
|
|
return getCommonSubClass(DefRC, SrcRC) != nullptr;
|
|
|
|
}
|
|
|
|
|
2014-05-02 17:41:42 +02:00
|
|
|
/// \brief Return the physical register for 32-bit channel \p Channel of
/// \p Reg, where \p SubRC is the sub-register class used for the generic
/// index-based lookup.
///
/// Special 64-bit registers (VCC, TBA, TMA, FLAT_SCR, EXEC) have dedicated
/// _LO/_HI halves and are handled explicitly; they only support channels 0
/// and 1.
unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
                                          const TargetRegisterClass *SubRC,
                                          unsigned Channel) const {

  switch (Reg) {
    case AMDGPU::VCC:
      switch(Channel) {
        case 0: return AMDGPU::VCC_LO;
        case 1: return AMDGPU::VCC_HI;
        default: llvm_unreachable("Invalid SubIdx for VCC"); break;
      }

    case AMDGPU::TBA:
      switch(Channel) {
        case 0: return AMDGPU::TBA_LO;
        case 1: return AMDGPU::TBA_HI;
        default: llvm_unreachable("Invalid SubIdx for TBA"); break;
      }

    case AMDGPU::TMA:
      switch(Channel) {
        case 0: return AMDGPU::TMA_LO;
        case 1: return AMDGPU::TMA_HI;
        default: llvm_unreachable("Invalid SubIdx for TMA"); break;
      }

  case AMDGPU::FLAT_SCR:
    switch (Channel) {
    case 0:
      return AMDGPU::FLAT_SCR_LO;
    case 1:
      return AMDGPU::FLAT_SCR_HI;
    default:
      llvm_unreachable("Invalid SubIdx for FLAT_SCR");
    }
    break;

  case AMDGPU::EXEC:
    switch (Channel) {
    case 0:
      return AMDGPU::EXEC_LO;
    case 1:
      return AMDGPU::EXEC_HI;
    default:
      llvm_unreachable("Invalid SubIdx for EXEC");
    }
    break;
  }

  const TargetRegisterClass *RC = getPhysRegClass(Reg);
  // 32-bit registers don't have sub-registers, so we can just return the
  // Reg. We need to have this check here, because the calculation below
  // using getHWRegIndex() will fail with special 32-bit registers like
  // VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0.
  if (RC->getSize() == 4) {
    assert(Channel == 0);
    return Reg;
  }

  // Generic path: the hardware encoding index plus the channel selects the
  // 32-bit register inside SubRC.
  unsigned Index = getHWRegIndex(Reg);
  return SubRC->getRegister(Index + Channel);
}
|
2014-07-02 22:53:44 +02:00
|
|
|
|
2015-01-12 20:33:18 +01:00
|
|
|
bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
|
|
|
|
return OpType == AMDGPU::OPERAND_REG_IMM32;
|
2014-07-02 22:53:44 +02:00
|
|
|
}
|
|
|
|
|
2015-01-12 20:33:18 +01:00
|
|
|
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
|
|
|
|
if (opCanUseLiteralConstant(OpType))
|
2014-09-23 23:26:25 +02:00
|
|
|
return true;
|
|
|
|
|
2015-01-12 20:33:18 +01:00
|
|
|
return OpType == AMDGPU::OPERAND_REG_INLINE_C;
|
2014-09-23 23:26:25 +02:00
|
|
|
}
|
|
|
|
|
2015-11-30 22:16:03 +01:00
|
|
|
// FIXME: Most of these are flexible with HSA and we don't need to reserve them
|
|
|
|
// as input registers if unused. Whether the dispatch ptr is necessary should be
|
|
|
|
// easy to detect from used intrinsics. Scratch setup is harder to know.
|
2014-07-21 17:45:01 +02:00
|
|
|
/// \brief Return the register that holds the given preloaded \p Value for
/// function \p MF.
///
/// System/user SGPRs come from SIMachineFunctionInfo; the work-item IDs are
/// fixed to VGPR0-2. Each case asserts that the corresponding input was
/// actually requested for this function.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                           enum PreloadedValue Value) const {

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  (void)ST; // Only used by the assert in the PRIVATE_SEGMENT_BUFFER case.
  switch (Value) {
  case SIRegisterInfo::WORKGROUP_ID_X:
    assert(MFI->hasWorkGroupIDX());
    return MFI->WorkGroupIDXSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Y:
    assert(MFI->hasWorkGroupIDY());
    return MFI->WorkGroupIDYSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Z:
    assert(MFI->hasWorkGroupIDZ());
    return MFI->WorkGroupIDZSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
    assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
    assert(MFI->hasPrivateSegmentBuffer());
    return MFI->PrivateSegmentBufferUserSGPR;
  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
    assert(MFI->hasKernargSegmentPtr());
    return MFI->KernargSegmentPtrUserSGPR;
  case SIRegisterInfo::DISPATCH_ID:
    llvm_unreachable("unimplemented");
  case SIRegisterInfo::FLAT_SCRATCH_INIT:
    assert(MFI->hasFlatScratchInit());
    return MFI->FlatScratchInitUserSGPR;
  case SIRegisterInfo::DISPATCH_PTR:
    assert(MFI->hasDispatchPtr());
    return MFI->DispatchPtrUserSGPR;
  case SIRegisterInfo::QUEUE_PTR:
    assert(MFI->hasQueuePtr());
    return MFI->QueuePtrUserSGPR;
  // Work-item IDs are preloaded into the first three VGPRs.
  case SIRegisterInfo::WORKITEM_ID_X:
    assert(MFI->hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case SIRegisterInfo::WORKITEM_ID_Y:
    assert(MFI->hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case SIRegisterInfo::WORKITEM_ID_Z:
    assert(MFI->hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected preloaded value type");
}
|
2014-09-24 03:33:17 +02:00
|
|
|
|
|
|
|
/// \brief Returns a register that is not used at any point in the function.
|
|
|
|
/// If all registers are used, then this function will return
|
|
|
|
// AMDGPU::NoRegister.
|
2015-01-14 16:42:31 +01:00
|
|
|
unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
|
|
|
|
const TargetRegisterClass *RC) const {
|
2015-08-18 20:54:27 +02:00
|
|
|
for (unsigned Reg : *RC)
|
|
|
|
if (!MRI.isPhysRegUsed(Reg))
|
|
|
|
return Reg;
|
2014-09-24 03:33:17 +02:00
|
|
|
return AMDGPU::NoRegister;
|
|
|
|
}
|
|
|
|
|
2015-01-29 17:55:25 +01:00
|
|
|
/// \brief Return how many VGPRs each wave may use while still allowing
/// \p WaveCount waves. Counts below 2 (and other unlisted values) get the
/// full budget of 256.
unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
  switch (WaveCount) {
  case 10: return 24;
  case 9:  return 28;
  case 8:  return 32;
  case 7:  return 36;
  case 6:  return 40;
  case 5:  return 48;
  case 4:  return 64;
  case 3:  return 84;
  case 2:  return 128;
  default: return 256;
  }
}
|
|
|
|
|
2015-03-09 16:48:00 +01:00
|
|
|
/// \brief Return how many SGPRs each wave may use while still allowing
/// \p WaveCount waves, for the given hardware \p gen.
unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
                                            unsigned WaveCount) const {
  // VI and newer have a different SGPR budget than SI/CI.
  if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    switch (WaveCount) {
    case 10:
    case 9:  return 80;
    case 8:  return 96;
    default: return 102;
    }
  }

  switch (WaveCount) {
  case 10: return 48;
  case 9:  return 56;
  case 8:  return 64;
  case 7:  return 72;
  case 6:  return 80;
  case 5:  return 96;
  default: return 103;
  }
}
|
2016-04-30 02:23:06 +02:00
|
|
|
|
|
|
|
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
|
|
|
|
unsigned Reg) const {
|
|
|
|
const TargetRegisterClass *RC;
|
|
|
|
if (TargetRegisterInfo::isVirtualRegister(Reg))
|
|
|
|
RC = MRI.getRegClass(Reg);
|
|
|
|
else
|
|
|
|
RC = getPhysRegClass(Reg);
|
|
|
|
|
|
|
|
return hasVGPRs(RC);
|
|
|
|
}
|