2012-12-11 22:25:42 +01:00
|
|
|
//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
|
|
|
|
//
|
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
/// \file
|
|
|
|
/// \brief SI implementation of the TargetRegisterInfo class.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "SIRegisterInfo.h"
|
2013-11-14 00:36:50 +01:00
|
|
|
#include "SIInstrInfo.h"
|
2014-07-21 17:45:01 +02:00
|
|
|
#include "SIMachineFunctionInfo.h"
|
|
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
|
|
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
|
|
|
#include "llvm/CodeGen/RegisterScavenging.h"
|
2014-08-21 22:40:54 +02:00
|
|
|
#include "llvm/IR/Function.h"
|
|
|
|
#include "llvm/IR/LLVMContext.h"
|
2012-12-11 22:25:42 +01:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
|
2016-03-23 02:53:22 +01:00
|
|
|
/// Return true if the -1-terminated list \p PSets contains \p PSetID.
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
  for (const int *P = PSets; *P != -1; ++P) {
    if (*P == static_cast<int>(PSetID))
      return true;
  }
  return false;
}
|
|
|
|
|
|
|
|
void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
|
|
|
|
BitVector &PressureSets) const {
|
|
|
|
for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
|
|
|
|
const int *PSets = getRegUnitPressureSets(*U);
|
|
|
|
if (hasPressureSet(PSets, PSetID)) {
|
|
|
|
PressureSets.set(PSetID);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
                                   SGPRPressureSets(getNumRegPressureSets()),
                                   VGPRPressureSets(getNumRegPressureSets()) {
  const unsigned NumSets = getNumRegPressureSets();

  // Start with out-of-range sentinels; the scan below is expected to find
  // both the SGPR_32 and VGPR_32 pressure sets (checked by the assert).
  SGPR32SetID = NumSets;
  VGPR32SetID = NumSets;

  for (unsigned PSet = 0; PSet != NumSets; ++PSet) {
    const char *SetName = getRegPressureSetName(PSet);
    if (strncmp("SGPR_32", SetName, 7) == 0)
      SGPR32SetID = PSet;
    else if (strncmp("VGPR_32", SetName, 7) == 0)
      VGPR32SetID = PSet;

    // Classify every pressure set as applying to SGPRs, VGPRs, or both,
    // using a representative register from each file.
    classifyPressureSet(PSet, AMDGPU::SGPR0, SGPRPressureSets);
    classifyPressureSet(PSet, AMDGPU::VGPR0, VGPRPressureSets);
  }

  assert(SGPR32SetID < NumSets && VGPR32SetID < NumSets);
}
|
2012-12-11 22:25:42 +01:00
|
|
|
|
2015-08-26 20:54:50 +02:00
|
|
|
/// Mark \p Reg and every register aliasing it (all overlapping tuples) as
/// reserved in \p Reserved.
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
  for (MCRegAliasIterator Alias(Reg, this, /*IncludeSelf=*/true);
       Alias.isValid(); ++Alias)
    Reserved.set(*Alias);
}
|
2014-09-24 03:33:23 +02:00
|
|
|
|
2015-11-30 22:16:03 +01:00
|
|
|
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
|
|
|
|
const MachineFunction &MF) const {
|
|
|
|
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
|
|
|
|
if (ST.hasSGPRInitBug()) {
|
AMDGPU/SI: xnack_mask is always reserved on VI
Summary:
Somehow, I first interpreted the docs as saying space for xnack_mask is only
reserved when XNACK is enabled via SH_MEM_CONFIG. I felt uneasy about this and
went back to actually test what is happening, and it turns out that xnack_mask
is always reserved at least on Tonga and Carrizo, in the sense that flat_scr
is always fixed below the SGPRs that are used to implement xnack_mask, whether
or not they are actually used.
I confirmed this by writing a shader using inline assembly to tease out the
aliasing between flat_scratch and regular SGPRs. For example, on Tonga, where
we fix the number of SGPRs to 80, s[74:75] aliases flat_scratch (so
xnack_mask is s[76:77] and vcc is s[78:79]).
This patch changes both the calculation of the total number of SGPRs and the
various register reservations to account for this.
It ought to be possible to use the gap left by xnack_mask when the feature
isn't used, but this patch doesn't try to do that. (Note that the same applies
to vcc.)
Note that previously, even before my earlier change in r256794, the SGPRs that
alias to xnack_mask could end up being used as well when flat_scr was unused
and the total number of SGPRs happened to fall on the right alignment
(e.g. highest regular SGPR being used s29 and VCC used would lead to number
of SGPRs being 32, where s28 and s29 alias with xnack_mask). So if there
were some conflict due to such aliasing, we should have noticed that already.
Reviewers: arsenm, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D15898
llvm-svn: 257073
2016-01-07 18:10:20 +01:00
|
|
|
// Leave space for flat_scr, xnack_mask, vcc, and alignment
|
|
|
|
unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
|
2015-11-30 22:16:03 +01:00
|
|
|
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
|
|
|
|
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
|
AMDGPU/SI: xnack_mask is always reserved on VI
Summary:
Somehow, I first interpreted the docs as saying space for xnack_mask is only
reserved when XNACK is enabled via SH_MEM_CONFIG. I felt uneasy about this and
went back to actually test what is happening, and it turns out that xnack_mask
is always reserved at least on Tonga and Carrizo, in the sense that flat_scr
is always fixed below the SGPRs that are used to implement xnack_mask, whether
or not they are actually used.
I confirmed this by writing a shader using inline assembly to tease out the
aliasing between flat_scratch and regular SGPRs. For example, on Tonga, where
we fix the number of SGPRs to 80, s[74:75] aliases flat_scratch (so
xnack_mask is s[76:77] and vcc is s[78:79]).
This patch changes both the calculation of the total number of SGPRs and the
various register reservations to account for this.
It ought to be possible to use the gap left by xnack_mask when the feature
isn't used, but this patch doesn't try to do that. (Note that the same applies
to vcc.)
Note that previously, even before my earlier change in r256794, the SGPRs that
alias to xnack_mask could end up being used as well when flat_scr was unused
and the total number of SGPRs happened to fall on the right alignment
(e.g. highest regular SGPR being used s29 and VCC used would lead to number
of SGPRs being 32, where s28 and s29 alias with xnack_mask). So if there
were some conflict due to such aliasing, we should have noticed that already.
Reviewers: arsenm, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D15898
llvm-svn: 257073
2016-01-07 18:10:20 +01:00
|
|
|
// 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
|
|
|
|
// 100/101 for vcc. This is the next sgpr128 down.
|
2015-11-30 22:16:03 +01:00
|
|
|
return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
|
|
|
|
}
|
|
|
|
|
|
|
|
return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
|
|
|
|
const MachineFunction &MF) const {
|
|
|
|
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
|
|
|
|
if (ST.hasSGPRInitBug()) {
|
AMDGPU/SI: xnack_mask is always reserved on VI
Summary:
Somehow, I first interpreted the docs as saying space for xnack_mask is only
reserved when XNACK is enabled via SH_MEM_CONFIG. I felt uneasy about this and
went back to actually test what is happening, and it turns out that xnack_mask
is always reserved at least on Tonga and Carrizo, in the sense that flat_scr
is always fixed below the SGPRs that are used to implement xnack_mask, whether
or not they are actually used.
I confirmed this by writing a shader using inline assembly to tease out the
aliasing between flat_scratch and regular SGPRs. For example, on Tonga, where
we fix the number of SGPRs to 80, s[74:75] aliases flat_scratch (so
xnack_mask is s[76:77] and vcc is s[78:79]).
This patch changes both the calculation of the total number of SGPRs and the
various register reservations to account for this.
It ought to be possible to use the gap left by xnack_mask when the feature
isn't used, but this patch doesn't try to do that. (Note that the same applies
to vcc.)
Note that previously, even before my earlier change in r256794, the SGPRs that
alias to xnack_mask could end up being used as well when flat_scr was unused
and the total number of SGPRs happened to fall on the right alignment
(e.g. highest regular SGPR being used s29 and VCC used would lead to number
of SGPRs being 32, where s28 and s29 alias with xnack_mask). So if there
were some conflict due to such aliasing, we should have noticed that already.
Reviewers: arsenm, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D15898
llvm-svn: 257073
2016-01-07 18:10:20 +01:00
|
|
|
unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
|
2015-11-30 22:16:03 +01:00
|
|
|
return AMDGPU::SGPR_32RegClass.getRegister(Idx);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
|
AMDGPU/SI: xnack_mask is always reserved on VI
Summary:
Somehow, I first interpreted the docs as saying space for xnack_mask is only
reserved when XNACK is enabled via SH_MEM_CONFIG. I felt uneasy about this and
went back to actually test what is happening, and it turns out that xnack_mask
is always reserved at least on Tonga and Carrizo, in the sense that flat_scr
is always fixed below the SGPRs that are used to implement xnack_mask, whether
or not they are actually used.
I confirmed this by writing a shader using inline assembly to tease out the
aliasing between flat_scratch and regular SGPRs. For example, on Tonga, where
we fix the number of SGPRs to 80, s[74:75] aliases flat_scratch (so
xnack_mask is s[76:77] and vcc is s[78:79]).
This patch changes both the calculation of the total number of SGPRs and the
various register reservations to account for this.
It ought to be possible to use the gap left by xnack_mask when the feature
isn't used, but this patch doesn't try to do that. (Note that the same applies
to vcc.)
Note that previously, even before my earlier change in r256794, the SGPRs that
alias to xnack_mask could end up being used as well when flat_scr was unused
and the total number of SGPRs happened to fall on the right alignment
(e.g. highest regular SGPR being used s29 and VCC used would lead to number
of SGPRs being 32, where s28 and s29 alias with xnack_mask). So if there
were some conflict due to such aliasing, we should have noticed that already.
Reviewers: arsenm, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D15898
llvm-svn: 257073
2016-01-07 18:10:20 +01:00
|
|
|
// Next register before reservations for flat_scr, xnack_mask, vcc,
|
|
|
|
// and scratch resource.
|
|
|
|
return AMDGPU::SGPR91;
|
2015-11-30 22:16:03 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return AMDGPU::SGPR95;
|
|
|
|
}
|
|
|
|
|
2015-08-26 20:54:50 +02:00
|
|
|
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());

  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);

  // EXEC_LO and EXEC_HI could be allocated and used as regular registers,
  // but doing so seems likely to result in bugs, so keep them reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // Reserve the last 2 registers so we will always have at least 2 more that
  // will physically contain VCC.
  reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);

  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
    // for VCC/XNACK_MASK/FLAT_SCR.
    //
    // TODO The SGPRs that alias to XNACK_MASK could be used as general purpose
    // SGPRs when the XNACK feature is not used. This is currently not done
    // because the code that counts SGPRs cannot account for such holes.
    reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
    reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
    reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
  }

  // Tonga and Iceland can only allocate a fixed number of SGPRs due
  // to a hw bug.
  if (ST.hasSGPRInitBug()) {
    unsigned TotalSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
    // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
    unsigned FirstReserved = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
    for (unsigned Idx = FirstReserved; Idx < TotalSGPRs; ++Idx)
      reserveRegisterTuples(Reserved, AMDGPU::SGPR_32RegClass.getRegister(Idx));
  }

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
  if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
    // Reserve 1 SGPR for the scratch wave offset in case we need to spill.
    reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
  }

  unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
    assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
  }

  return Reserved;
}
|
|
|
|
|
2015-03-11 19:34:58 +01:00
|
|
|
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
|
|
|
|
unsigned Idx) const {
|
2015-03-11 19:43:21 +01:00
|
|
|
const AMDGPUSubtarget &STI = MF.getSubtarget<AMDGPUSubtarget>();
|
2015-01-29 17:55:25 +01:00
|
|
|
// FIXME: We should adjust the max number of waves based on LDS size.
|
2015-03-11 19:43:21 +01:00
|
|
|
unsigned SGPRLimit = getNumSGPRsAllowed(STI.getGeneration(),
|
|
|
|
STI.getMaxWavesPerCU());
|
|
|
|
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
|
2015-01-29 17:55:25 +01:00
|
|
|
|
2015-11-12 22:43:25 +01:00
|
|
|
unsigned VSLimit = SGPRLimit + VGPRLimit;
|
|
|
|
|
2016-03-23 02:53:22 +01:00
|
|
|
if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
|
|
|
|
// FIXME: This is a hack. We should never be considering the pressure of
|
|
|
|
// these since no virtual register should ever have this class.
|
|
|
|
return VSLimit;
|
2015-01-29 17:55:25 +01:00
|
|
|
}
|
2016-03-23 02:53:22 +01:00
|
|
|
|
|
|
|
if (SGPRPressureSets.test(Idx))
|
|
|
|
return SGPRLimit;
|
|
|
|
|
|
|
|
return VGPRLimit;
|
2013-03-26 15:04:02 +01:00
|
|
|
}
|
|
|
|
|
2014-07-21 17:45:01 +02:00
|
|
|
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
|
|
|
|
return Fn.getFrameInfo()->hasStackObjects();
|
|
|
|
}
|
|
|
|
|
2016-03-04 19:02:01 +01:00
|
|
|
bool
|
|
|
|
SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
|
|
|
|
return MF.getFrameInfo()->hasStackObjects();
|
|
|
|
}
|
|
|
|
|
2014-08-21 22:40:54 +02:00
|
|
|
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
|
|
|
|
|
|
|
|
switch (Op) {
|
|
|
|
case AMDGPU::SI_SPILL_S512_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S512_RESTORE:
|
2014-09-24 03:33:17 +02:00
|
|
|
case AMDGPU::SI_SPILL_V512_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V512_RESTORE:
|
2014-08-21 22:40:54 +02:00
|
|
|
return 16;
|
|
|
|
case AMDGPU::SI_SPILL_S256_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S256_RESTORE:
|
2014-09-24 03:33:17 +02:00
|
|
|
case AMDGPU::SI_SPILL_V256_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V256_RESTORE:
|
2014-08-21 22:40:54 +02:00
|
|
|
return 8;
|
|
|
|
case AMDGPU::SI_SPILL_S128_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S128_RESTORE:
|
2014-09-24 03:33:17 +02:00
|
|
|
case AMDGPU::SI_SPILL_V128_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V128_RESTORE:
|
2014-08-21 22:40:54 +02:00
|
|
|
return 4;
|
2014-09-24 03:33:17 +02:00
|
|
|
case AMDGPU::SI_SPILL_V96_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V96_RESTORE:
|
|
|
|
return 3;
|
2014-08-21 22:40:54 +02:00
|
|
|
case AMDGPU::SI_SPILL_S64_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S64_RESTORE:
|
2014-09-24 03:33:17 +02:00
|
|
|
case AMDGPU::SI_SPILL_V64_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V64_RESTORE:
|
2014-08-21 22:40:54 +02:00
|
|
|
return 2;
|
|
|
|
case AMDGPU::SI_SPILL_S32_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S32_RESTORE:
|
2014-09-24 03:33:17 +02:00
|
|
|
case AMDGPU::SI_SPILL_V32_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V32_RESTORE:
|
2014-08-21 22:40:54 +02:00
|
|
|
return 1;
|
|
|
|
default: llvm_unreachable("Invalid spill opcode");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-01-14 16:42:31 +01:00
|
|
|
void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
|
|
|
|
unsigned LoadStoreOp,
|
|
|
|
unsigned Value,
|
2015-01-20 18:49:47 +01:00
|
|
|
unsigned ScratchRsrcReg,
|
2015-01-14 16:42:31 +01:00
|
|
|
unsigned ScratchOffset,
|
2016-03-04 19:02:01 +01:00
|
|
|
int64_t Offset) const {
|
2015-01-14 16:42:31 +01:00
|
|
|
|
|
|
|
MachineBasicBlock *MBB = MI->getParent();
|
2016-03-04 19:02:01 +01:00
|
|
|
MachineFunction *MF = MI->getParent()->getParent();
|
|
|
|
MachineRegisterInfo &MRI = MF->getRegInfo();
|
2015-03-11 19:43:21 +01:00
|
|
|
const SIInstrInfo *TII =
|
|
|
|
static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
|
2015-01-14 16:42:31 +01:00
|
|
|
LLVMContext &Ctx = MF->getFunction()->getContext();
|
|
|
|
DebugLoc DL = MI->getDebugLoc();
|
AMDGPU/SI: add llvm.amdgcn.buffer.load/store.format intrinsics
Summary:
They correspond to BUFFER_LOAD/STORE_FORMAT_XYZW and will be used by Mesa
to implement the GL_ARB_shader_image_load_store extension.
The intention is that for llvm.amdgcn.buffer.load.format, LLVM will decide
whether one of the _X/_XY/_XYZ opcodes can be used (similar to image sampling
and loads). However, this is not currently implemented.
For llvm.amdgcn.buffer.store, LLVM cannot decide to use one of the "smaller"
opcodes and therefore the intrinsic is overloaded. Currently, only the v4f32
is actually implemented since GLSL also only has a vec4 variant of the store
instructions, although it's conceivable that Mesa will want to be smarter
about this in the future.
BUFFER_LOAD_FORMAT_XYZW is already exposed via llvm.SI.vs.load.input, which
has a legacy name, pretends not to access memory, and does not capture the
full flexibility of the instruction.
Reviewers: arsenm, tstellarAMD, mareko
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D17277
llvm-svn: 263140
2016-03-10 19:43:50 +01:00
|
|
|
bool IsStore = TII->get(LoadStoreOp).mayStore();
|
2015-01-14 16:42:31 +01:00
|
|
|
|
|
|
|
bool RanOutOfSGPRs = false;
|
2016-02-10 21:13:58 +01:00
|
|
|
bool Scavenged = false;
|
2015-01-14 16:42:31 +01:00
|
|
|
unsigned SOffset = ScratchOffset;
|
|
|
|
|
|
|
|
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
|
|
|
|
unsigned Size = NumSubRegs * 4;
|
|
|
|
|
|
|
|
if (!isUInt<12>(Offset + Size)) {
|
2016-03-04 19:02:01 +01:00
|
|
|
SOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
|
2015-01-14 16:42:31 +01:00
|
|
|
if (SOffset == AMDGPU::NoRegister) {
|
|
|
|
RanOutOfSGPRs = true;
|
|
|
|
SOffset = AMDGPU::SGPR0;
|
2016-02-10 21:13:58 +01:00
|
|
|
} else {
|
|
|
|
Scavenged = true;
|
2015-01-14 16:42:31 +01:00
|
|
|
}
|
|
|
|
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
|
|
|
|
.addReg(ScratchOffset)
|
|
|
|
.addImm(Offset);
|
|
|
|
Offset = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (RanOutOfSGPRs)
|
|
|
|
Ctx.emitError("Ran out of SGPRs for spilling VGPRS");
|
|
|
|
|
|
|
|
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += 4) {
|
|
|
|
unsigned SubReg = NumSubRegs > 1 ?
|
|
|
|
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
|
|
|
|
Value;
|
|
|
|
|
2016-02-10 21:13:58 +01:00
|
|
|
unsigned SOffsetRegState = 0;
|
|
|
|
if (i + 1 == e && Scavenged)
|
|
|
|
SOffsetRegState |= RegState::Kill;
|
|
|
|
|
2015-01-14 16:42:31 +01:00
|
|
|
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
|
AMDGPU/SI: add llvm.amdgcn.buffer.load/store.format intrinsics
Summary:
They correspond to BUFFER_LOAD/STORE_FORMAT_XYZW and will be used by Mesa
to implement the GL_ARB_shader_image_load_store extension.
The intention is that for llvm.amdgcn.buffer.load.format, LLVM will decide
whether one of the _X/_XY/_XYZ opcodes can be used (similar to image sampling
and loads). However, this is not currently implemented.
For llvm.amdgcn.buffer.store, LLVM cannot decide to use one of the "smaller"
opcodes and therefore the intrinsic is overloaded. Currently, only the v4f32
is actually implemented since GLSL also only has a vec4 variant of the store
instructions, although it's conceivable that Mesa will want to be smarter
about this in the future.
BUFFER_LOAD_FORMAT_XYZW is already exposed via llvm.SI.vs.load.input, which
has a legacy name, pretends not to access memory, and does not capture the
full flexibility of the instruction.
Reviewers: arsenm, tstellarAMD, mareko
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D17277
llvm-svn: 263140
2016-03-10 19:43:50 +01:00
|
|
|
.addReg(SubReg, getDefRegState(!IsStore))
|
2015-11-30 22:16:03 +01:00
|
|
|
.addReg(ScratchRsrcReg)
|
2016-02-10 21:13:58 +01:00
|
|
|
.addReg(SOffset, SOffsetRegState)
|
2015-08-29 08:48:57 +02:00
|
|
|
.addImm(Offset)
|
|
|
|
.addImm(0) // glc
|
|
|
|
.addImm(0) // slc
|
|
|
|
.addImm(0) // tfe
|
AMDGPU/SI: add llvm.amdgcn.buffer.load/store.format intrinsics
Summary:
They correspond to BUFFER_LOAD/STORE_FORMAT_XYZW and will be used by Mesa
to implement the GL_ARB_shader_image_load_store extension.
The intention is that for llvm.amdgcn.buffer.load.format, LLVM will decide
whether one of the _X/_XY/_XYZ opcodes can be used (similar to image sampling
and loads). However, this is not currently implemented.
For llvm.amdgcn.buffer.store, LLVM cannot decide to use one of the "smaller"
opcodes and therefore the intrinsic is overloaded. Currently, only the v4f32
is actually implemented since GLSL also only has a vec4 variant of the store
instructions, although it's conceivable that Mesa will want to be smarter
about this in the future.
BUFFER_LOAD_FORMAT_XYZW is already exposed via llvm.SI.vs.load.input, which
has a legacy name, pretends not to access memory, and does not capture the
full flexibility of the instruction.
Reviewers: arsenm, tstellarAMD, mareko
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D17277
llvm-svn: 263140
2016-03-10 19:43:50 +01:00
|
|
|
.addReg(Value, RegState::Implicit | getDefRegState(!IsStore))
|
2015-08-29 08:48:57 +02:00
|
|
|
.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
|
2015-01-14 16:42:31 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-21 17:45:01 +02:00
|
|
|
void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
|
|
|
|
int SPAdj, unsigned FIOperandNum,
|
|
|
|
RegScavenger *RS) const {
|
|
|
|
MachineFunction *MF = MI->getParent()->getParent();
|
2016-03-04 19:02:01 +01:00
|
|
|
MachineRegisterInfo &MRI = MF->getRegInfo();
|
2014-08-21 22:40:54 +02:00
|
|
|
MachineBasicBlock *MBB = MI->getParent();
|
|
|
|
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
|
2014-07-21 17:45:01 +02:00
|
|
|
MachineFrameInfo *FrameInfo = MF->getFrameInfo();
|
2015-03-11 19:43:21 +01:00
|
|
|
const SIInstrInfo *TII =
|
|
|
|
static_cast<const SIInstrInfo *>(MF->getSubtarget().getInstrInfo());
|
2014-08-21 22:40:54 +02:00
|
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
|
2014-07-21 17:45:01 +02:00
|
|
|
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
|
|
|
|
int Index = MI->getOperand(FIOperandNum).getIndex();
|
2014-08-21 22:40:54 +02:00
|
|
|
|
|
|
|
switch (MI->getOpcode()) {
|
|
|
|
// SGPR register spill
|
|
|
|
case AMDGPU::SI_SPILL_S512_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S256_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S128_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S64_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_S32_SAVE: {
|
|
|
|
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
|
2016-03-04 19:31:18 +01:00
|
|
|
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
2014-08-21 22:40:54 +02:00
|
|
|
|
|
|
|
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
|
|
|
|
unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
|
|
|
|
&AMDGPU::SGPR_32RegClass, i);
|
|
|
|
struct SIMachineFunctionInfo::SpilledReg Spill =
|
|
|
|
MFI->getSpilledReg(MF, Index, i);
|
|
|
|
|
2016-03-04 19:31:18 +01:00
|
|
|
if (Spill.hasReg()) {
|
|
|
|
BuildMI(*MBB, MI, DL,
|
|
|
|
TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
|
|
|
|
Spill.VGPR)
|
|
|
|
.addReg(SubReg)
|
|
|
|
.addImm(Spill.Lane);
|
|
|
|
|
|
|
|
// FIXME: Since this spills to another register instead of an actual
|
|
|
|
// frame index, we should delete the frame index when all references to
|
|
|
|
// it are fixed.
|
|
|
|
} else {
|
|
|
|
// Spill SGPR to a frame index.
|
|
|
|
// FIXME we should use S_STORE_DWORD here for VI.
|
|
|
|
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
|
|
|
|
.addReg(SubReg);
|
|
|
|
|
|
|
|
unsigned Size = FrameInfo->getObjectSize(Index);
|
|
|
|
unsigned Align = FrameInfo->getObjectAlignment(Index);
|
|
|
|
MachinePointerInfo PtrInfo
|
|
|
|
= MachinePointerInfo::getFixedStack(*MF, Index);
|
|
|
|
MachineMemOperand *MMO
|
|
|
|
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
|
|
|
|
Size, Align);
|
|
|
|
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
|
|
|
|
.addReg(TmpReg) // src
|
|
|
|
.addFrameIndex(Index) // frame_idx
|
|
|
|
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
|
|
|
|
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
|
|
|
|
.addImm(i * 4) // offset
|
|
|
|
.addMemOperand(MMO);
|
|
|
|
}
|
2014-08-21 22:40:54 +02:00
|
|
|
}
|
|
|
|
MI->eraseFromParent();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// SGPR register restore
|
|
|
|
case AMDGPU::SI_SPILL_S512_RESTORE:
|
|
|
|
case AMDGPU::SI_SPILL_S256_RESTORE:
|
|
|
|
case AMDGPU::SI_SPILL_S128_RESTORE:
|
|
|
|
case AMDGPU::SI_SPILL_S64_RESTORE:
|
|
|
|
case AMDGPU::SI_SPILL_S32_RESTORE: {
|
|
|
|
unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
|
2016-03-04 19:31:18 +01:00
|
|
|
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
2014-08-21 22:40:54 +02:00
|
|
|
|
|
|
|
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
|
|
|
|
unsigned SubReg = getPhysRegSubReg(MI->getOperand(0).getReg(),
|
|
|
|
&AMDGPU::SGPR_32RegClass, i);
|
|
|
|
struct SIMachineFunctionInfo::SpilledReg Spill =
|
|
|
|
MFI->getSpilledReg(MF, Index, i);
|
|
|
|
|
2016-03-04 19:31:18 +01:00
|
|
|
if (Spill.hasReg()) {
|
|
|
|
BuildMI(*MBB, MI, DL,
|
|
|
|
TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
|
|
|
|
SubReg)
|
|
|
|
.addReg(Spill.VGPR)
|
|
|
|
.addImm(Spill.Lane)
|
|
|
|
.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
|
|
|
|
} else {
|
|
|
|
// Restore SGPR from a stack slot.
|
|
|
|
// FIXME: We should use S_LOAD_DWORD here for VI.
|
|
|
|
|
|
|
|
unsigned Align = FrameInfo->getObjectAlignment(Index);
|
|
|
|
unsigned Size = FrameInfo->getObjectSize(Index);
|
|
|
|
|
|
|
|
MachinePointerInfo PtrInfo
|
|
|
|
= MachinePointerInfo::getFixedStack(*MF, Index);
|
|
|
|
|
|
|
|
MachineMemOperand *MMO = MF->getMachineMemOperand(
|
|
|
|
PtrInfo, MachineMemOperand::MOLoad, Size, Align);
|
|
|
|
|
|
|
|
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
|
|
|
|
.addFrameIndex(Index) // frame_idx
|
|
|
|
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
|
|
|
|
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
|
|
|
|
.addImm(i * 4) // offset
|
|
|
|
.addMemOperand(MMO);
|
|
|
|
BuildMI(*MBB, MI, DL,
|
|
|
|
TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
|
|
|
|
.addReg(TmpReg)
|
|
|
|
.addImm(0)
|
|
|
|
.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
|
|
|
|
}
|
2014-08-21 22:40:54 +02:00
|
|
|
}
|
2015-03-24 14:40:38 +01:00
|
|
|
|
|
|
|
// TODO: only do this when it is needed
|
|
|
|
switch (MF->getSubtarget<AMDGPUSubtarget>().getGeneration()) {
|
|
|
|
case AMDGPUSubtarget::SOUTHERN_ISLANDS:
|
2015-12-17 17:46:42 +01:00
|
|
|
// "VALU writes SGPR" -> "SMRD reads that SGPR" needs 4 wait states
|
|
|
|
// ("S_NOP 3") on SI
|
|
|
|
TII->insertWaitStates(MI, 4);
|
2015-03-24 14:40:38 +01:00
|
|
|
break;
|
|
|
|
case AMDGPUSubtarget::SEA_ISLANDS:
|
|
|
|
break;
|
|
|
|
default: // VOLCANIC_ISLANDS and later
|
2015-12-17 17:46:42 +01:00
|
|
|
// "VALU writes SGPR -> VMEM reads that SGPR" needs 5 wait states
|
|
|
|
// ("S_NOP 4") on VI and later. This also applies to VALUs which write
|
|
|
|
// VCC, but we're unlikely to see VMEM use VCC.
|
|
|
|
TII->insertWaitStates(MI, 5);
|
2015-03-24 14:40:38 +01:00
|
|
|
}
|
|
|
|
|
2014-08-21 22:40:54 +02:00
|
|
|
MI->eraseFromParent();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-09-24 03:33:17 +02:00
|
|
|
// VGPR register spill
|
|
|
|
case AMDGPU::SI_SPILL_V512_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V256_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V128_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V96_SAVE:
|
|
|
|
case AMDGPU::SI_SPILL_V64_SAVE:
|
2015-01-14 16:42:31 +01:00
|
|
|
case AMDGPU::SI_SPILL_V32_SAVE:
|
|
|
|
buildScratchLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
|
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::src)->getReg(),
|
2015-01-20 18:49:47 +01:00
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
|
2015-01-14 16:42:31 +01:00
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
|
2016-03-04 19:31:18 +01:00
|
|
|
FrameInfo->getObjectOffset(Index) +
|
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm());
|
2014-09-24 03:33:17 +02:00
|
|
|
MI->eraseFromParent();
|
|
|
|
break;
|
|
|
|
case AMDGPU::SI_SPILL_V32_RESTORE:
|
|
|
|
case AMDGPU::SI_SPILL_V64_RESTORE:
|
2015-01-30 22:51:51 +01:00
|
|
|
case AMDGPU::SI_SPILL_V96_RESTORE:
|
2014-09-24 03:33:17 +02:00
|
|
|
case AMDGPU::SI_SPILL_V128_RESTORE:
|
|
|
|
case AMDGPU::SI_SPILL_V256_RESTORE:
|
|
|
|
case AMDGPU::SI_SPILL_V512_RESTORE: {
|
2015-01-14 16:42:31 +01:00
|
|
|
buildScratchLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
|
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::dst)->getReg(),
|
2015-01-20 18:49:47 +01:00
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_rsrc)->getReg(),
|
2015-01-14 16:42:31 +01:00
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::scratch_offset)->getReg(),
|
2016-03-04 19:31:18 +01:00
|
|
|
FrameInfo->getObjectOffset(Index) +
|
|
|
|
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm());
|
2014-09-24 03:33:17 +02:00
|
|
|
MI->eraseFromParent();
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2014-08-21 22:40:54 +02:00
|
|
|
default: {
|
|
|
|
int64_t Offset = FrameInfo->getObjectOffset(Index);
|
|
|
|
FIOp.ChangeToImmediate(Offset);
|
|
|
|
if (!TII->isImmOperandLegal(MI, FIOperandNum, FIOp)) {
|
2016-03-04 19:02:01 +01:00
|
|
|
unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
2014-08-21 22:40:54 +02:00
|
|
|
BuildMI(*MBB, MI, MI->getDebugLoc(),
|
|
|
|
TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
|
|
|
|
.addImm(Offset);
|
2015-01-20 18:49:45 +01:00
|
|
|
FIOp.ChangeToRegister(TmpReg, false, false, true);
|
2014-08-21 22:40:54 +02:00
|
|
|
}
|
|
|
|
}
|
2014-07-21 17:45:01 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-14 00:36:50 +01:00
|
|
|
unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
|
2014-03-31 16:01:52 +02:00
|
|
|
return getEncodingValue(Reg) & 0xff;
|
2013-11-14 00:36:50 +01:00
|
|
|
}
|
|
|
|
|
2015-10-01 23:43:15 +02:00
|
|
|
// FIXME: This is very slow. It might be worth creating a map from physreg to
|
|
|
|
// register class.
|
2013-08-07 01:08:18 +02:00
|
|
|
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
|
|
|
|
assert(!TargetRegisterInfo::isVirtualRegister(Reg));
|
|
|
|
|
2015-10-18 07:15:34 +02:00
|
|
|
static const TargetRegisterClass *const BaseClasses[] = {
|
2015-01-07 21:59:25 +01:00
|
|
|
&AMDGPU::VGPR_32RegClass,
|
2013-08-07 01:08:18 +02:00
|
|
|
&AMDGPU::SReg_32RegClass,
|
|
|
|
&AMDGPU::VReg_64RegClass,
|
|
|
|
&AMDGPU::SReg_64RegClass,
|
2014-09-24 03:33:17 +02:00
|
|
|
&AMDGPU::VReg_96RegClass,
|
|
|
|
&AMDGPU::VReg_128RegClass,
|
2013-08-07 01:08:18 +02:00
|
|
|
&AMDGPU::SReg_128RegClass,
|
2014-09-24 03:33:17 +02:00
|
|
|
&AMDGPU::VReg_256RegClass,
|
|
|
|
&AMDGPU::SReg_256RegClass,
|
2015-08-14 21:46:05 +02:00
|
|
|
&AMDGPU::VReg_512RegClass,
|
2016-02-13 00:45:29 +01:00
|
|
|
&AMDGPU::SReg_512RegClass,
|
|
|
|
&AMDGPU::SCC_CLASSRegClass,
|
2013-08-07 01:08:18 +02:00
|
|
|
};
|
|
|
|
|
2014-05-12 21:23:21 +02:00
|
|
|
for (const TargetRegisterClass *BaseClass : BaseClasses) {
|
|
|
|
if (BaseClass->contains(Reg)) {
|
|
|
|
return BaseClass;
|
2013-08-07 01:08:18 +02:00
|
|
|
}
|
|
|
|
}
|
2014-04-25 07:30:21 +02:00
|
|
|
return nullptr;
|
2013-08-07 01:08:18 +02:00
|
|
|
}
|
2013-08-15 01:24:24 +02:00
|
|
|
|
2015-09-26 06:59:04 +02:00
|
|
|
// TODO: It might be helpful to have some target specific flags in
|
|
|
|
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
|
2013-11-14 00:36:37 +01:00
|
|
|
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
|
2015-09-26 06:59:04 +02:00
|
|
|
switch (RC->getSize()) {
|
2016-02-13 00:45:29 +01:00
|
|
|
case 0: return false;
|
|
|
|
case 1: return false;
|
2015-09-26 06:59:04 +02:00
|
|
|
case 4:
|
|
|
|
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
|
|
|
|
case 8:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
|
|
|
|
case 12:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
|
|
|
|
case 16:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
|
|
|
|
case 32:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
|
|
|
|
case 64:
|
|
|
|
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid register class size");
|
|
|
|
}
|
2013-11-14 00:36:37 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Map a register class to the VGPR class of the same byte size.
const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
                                         const TargetRegisterClass *SRC) const {
  switch (SRC->getSize()) {
  case 4:  return &AMDGPU::VGPR_32RegClass;
  case 8:  return &AMDGPU::VReg_64RegClass;
  case 12: return &AMDGPU::VReg_96RegClass;
  case 16: return &AMDGPU::VReg_128RegClass;
  case 32: return &AMDGPU::VReg_256RegClass;
  case 64: return &AMDGPU::VReg_512RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
|
|
|
|
|
2016-02-11 22:45:07 +01:00
|
|
|
// Map a register class to the SGPR class of the same byte size. Note there is
// no 96-bit SGPR class, so size 12 falls into the unreachable default.
const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
                                         const TargetRegisterClass *VRC) const {
  switch (VRC->getSize()) {
  case 4:  return &AMDGPU::SGPR_32RegClass;
  case 8:  return &AMDGPU::SReg_64RegClass;
  case 16: return &AMDGPU::SReg_128RegClass;
  case 32: return &AMDGPU::SReg_256RegClass;
  case 64: return &AMDGPU::SReg_512RegClass;
  default:
    llvm_unreachable("Invalid register class size");
  }
}
|
|
|
|
|
2013-11-14 00:36:37 +01:00
|
|
|
// Return the register class covering sub-register index \p SubIdx of \p RC,
// choosing between the SGPR and VGPR families based on RC itself.
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
  const TargetRegisterClass *RC, unsigned SubIdx) const {
  if (SubIdx == AMDGPU::NoSubRegister)
    return RC;

  // We can assume that each lane corresponds to one 32-bit register.
  const unsigned Count = countPopulation(getSubRegIndexLaneMask(SubIdx));
  const bool IsSGPR = isSGPRClass(RC);

  switch (Count) {
  case 1:
    return IsSGPR ? &AMDGPU::SGPR_32RegClass : &AMDGPU::VGPR_32RegClass;
  case 2:
    return IsSGPR ? &AMDGPU::SReg_64RegClass : &AMDGPU::VReg_64RegClass;
  case 3:
    // Only the vector side has a 96-bit class.
    if (IsSGPR)
      llvm_unreachable("Invalid sub-register class size");
    return &AMDGPU::VReg_96RegClass;
  case 4:
    return IsSGPR ? &AMDGPU::SReg_128RegClass : &AMDGPU::VReg_128RegClass;
  case 8:
    return IsSGPR ? &AMDGPU::SReg_256RegClass : &AMDGPU::VReg_256RegClass;
  default: // includes 16
    llvm_unreachable("Invalid sub-register class size");
  }
}
|
2014-05-02 17:41:42 +02:00
|
|
|
|
2015-09-24 10:36:14 +02:00
|
|
|
bool SIRegisterInfo::shouldRewriteCopySrc(
|
|
|
|
const TargetRegisterClass *DefRC,
|
|
|
|
unsigned DefSubReg,
|
|
|
|
const TargetRegisterClass *SrcRC,
|
|
|
|
unsigned SrcSubReg) const {
|
|
|
|
// We want to prefer the smallest register class possible, so we don't want to
|
|
|
|
// stop and rewrite on anything that looks like a subregister
|
|
|
|
// extract. Operations mostly don't care about the super register class, so we
|
|
|
|
// only want to stop on the most basic of copies between the smae register
|
|
|
|
// class.
|
|
|
|
//
|
|
|
|
// e.g. if we have something like
|
|
|
|
// vreg0 = ...
|
|
|
|
// vreg1 = ...
|
|
|
|
// vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
|
|
|
|
// vreg3 = COPY vreg2, sub0
|
|
|
|
//
|
|
|
|
// We want to look through the COPY to find:
|
|
|
|
// => vreg3 = COPY vreg0
|
|
|
|
|
|
|
|
// Plain copy.
|
|
|
|
return getCommonSubClass(DefRC, SrcRC) != nullptr;
|
|
|
|
}
|
|
|
|
|
2014-05-02 17:41:42 +02:00
|
|
|
unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
|
|
|
|
const TargetRegisterClass *SubRC,
|
|
|
|
unsigned Channel) const {
|
2014-08-21 22:40:50 +02:00
|
|
|
|
|
|
|
switch (Reg) {
|
|
|
|
case AMDGPU::VCC:
|
|
|
|
switch(Channel) {
|
|
|
|
case 0: return AMDGPU::VCC_LO;
|
|
|
|
case 1: return AMDGPU::VCC_HI;
|
|
|
|
default: llvm_unreachable("Invalid SubIdx for VCC");
|
|
|
|
}
|
2014-09-15 17:41:53 +02:00
|
|
|
|
|
|
|
case AMDGPU::FLAT_SCR:
|
|
|
|
switch (Channel) {
|
|
|
|
case 0:
|
|
|
|
return AMDGPU::FLAT_SCR_LO;
|
|
|
|
case 1:
|
|
|
|
return AMDGPU::FLAT_SCR_HI;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid SubIdx for FLAT_SCR");
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case AMDGPU::EXEC:
|
|
|
|
switch (Channel) {
|
|
|
|
case 0:
|
|
|
|
return AMDGPU::EXEC_LO;
|
|
|
|
case 1:
|
|
|
|
return AMDGPU::EXEC_HI;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Invalid SubIdx for EXEC");
|
|
|
|
}
|
|
|
|
break;
|
2014-08-21 22:40:50 +02:00
|
|
|
}
|
|
|
|
|
2014-09-24 03:33:22 +02:00
|
|
|
const TargetRegisterClass *RC = getPhysRegClass(Reg);
|
|
|
|
// 32-bit registers don't have sub-registers, so we can just return the
|
|
|
|
// Reg. We need to have this check here, because the calculation below
|
|
|
|
// using getHWRegIndex() will fail with special 32-bit registers like
|
|
|
|
// VCC_LO, VCC_HI, EXEC_LO, EXEC_HI and M0.
|
|
|
|
if (RC->getSize() == 4) {
|
|
|
|
assert(Channel == 0);
|
|
|
|
return Reg;
|
|
|
|
}
|
|
|
|
|
2014-05-02 17:41:42 +02:00
|
|
|
unsigned Index = getHWRegIndex(Reg);
|
|
|
|
return SubRC->getRegister(Index + Channel);
|
|
|
|
}
|
2014-07-02 22:53:44 +02:00
|
|
|
|
2015-01-12 20:33:18 +01:00
|
|
|
bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
|
|
|
|
return OpType == AMDGPU::OPERAND_REG_IMM32;
|
2014-07-02 22:53:44 +02:00
|
|
|
}
|
|
|
|
|
2015-01-12 20:33:18 +01:00
|
|
|
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
|
|
|
|
if (opCanUseLiteralConstant(OpType))
|
2014-09-23 23:26:25 +02:00
|
|
|
return true;
|
|
|
|
|
2015-01-12 20:33:18 +01:00
|
|
|
return OpType == AMDGPU::OPERAND_REG_INLINE_C;
|
2014-09-23 23:26:25 +02:00
|
|
|
}
|
|
|
|
|
2015-11-30 22:16:03 +01:00
|
|
|
// FIXME: Most of these are flexible with HSA and we don't need to reserve them
// as input registers if unused. Whether the dispatch ptr is necessary should be
// easy to detect from used intrinsics. Scratch setup is harder to know.
//
// Returns the physical register that holds the given preloaded ABI value on
// function entry. System SGPRs / user SGPRs come from SIMachineFunctionInfo;
// workitem IDs live in fixed VGPRs. Asserts that the corresponding input was
// actually requested for this function.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
                                           enum PreloadedValue Value) const {

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // ST is only consulted by the HSA assertion below; the cast to void keeps
  // release builds (where assert compiles away) free of unused warnings.
  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
  (void)ST;
  switch (Value) {
  case SIRegisterInfo::WORKGROUP_ID_X:
    assert(MFI->hasWorkGroupIDX());
    return MFI->WorkGroupIDXSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Y:
    assert(MFI->hasWorkGroupIDY());
    return MFI->WorkGroupIDYSystemSGPR;
  case SIRegisterInfo::WORKGROUP_ID_Z:
    assert(MFI->hasWorkGroupIDZ());
    return MFI->WorkGroupIDZSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
    return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
  case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
    assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
    assert(MFI->hasPrivateSegmentBuffer());
    return MFI->PrivateSegmentBufferUserSGPR;
  case SIRegisterInfo::KERNARG_SEGMENT_PTR:
    assert(MFI->hasKernargSegmentPtr());
    return MFI->KernargSegmentPtrUserSGPR;
  case SIRegisterInfo::DISPATCH_ID:
    llvm_unreachable("unimplemented");
  case SIRegisterInfo::FLAT_SCRATCH_INIT:
    assert(MFI->hasFlatScratchInit());
    return MFI->FlatScratchInitUserSGPR;
  case SIRegisterInfo::DISPATCH_PTR:
    assert(MFI->hasDispatchPtr());
    return MFI->DispatchPtrUserSGPR;
  case SIRegisterInfo::QUEUE_PTR:
    llvm_unreachable("not implemented");
  // Workitem IDs are preloaded into the first three VGPRs.
  case SIRegisterInfo::WORKITEM_ID_X:
    assert(MFI->hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case SIRegisterInfo::WORKITEM_ID_Y:
    assert(MFI->hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case SIRegisterInfo::WORKITEM_ID_Z:
    assert(MFI->hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected preloaded value type");
}
|
2014-09-24 03:33:17 +02:00
|
|
|
|
|
|
|
/// \brief Returns a register that is not used at any point in the function.
|
|
|
|
/// If all registers are used, then this function will return
|
|
|
|
// AMDGPU::NoRegister.
|
2015-01-14 16:42:31 +01:00
|
|
|
unsigned SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
|
|
|
|
const TargetRegisterClass *RC) const {
|
2015-08-18 20:54:27 +02:00
|
|
|
for (unsigned Reg : *RC)
|
|
|
|
if (!MRI.isPhysRegUsed(Reg))
|
|
|
|
return Reg;
|
2014-09-24 03:33:17 +02:00
|
|
|
return AMDGPU::NoRegister;
|
|
|
|
}
|
|
|
|
|
2015-01-29 17:55:25 +01:00
|
|
|
// VGPR budget for a given wave occupancy: the more waves that must be
// resident per SIMD, the fewer VGPRs each wave may use. Wave counts outside
// [2, 10] get the full file of 256 registers.
unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
  // Index 0 corresponds to WaveCount == 1 (full file), index 9 to 10 waves.
  static const unsigned Limits[] = {
    256, 128, 84, 64, 48, 40, 36, 32, 28, 24
  };
  if (WaveCount >= 2 && WaveCount <= 10)
    return Limits[WaveCount - 1];
  return 256;
}
|
|
|
|
|
2015-03-09 16:48:00 +01:00
|
|
|
// SGPR budget for a given wave occupancy. The limits differ between the
// pre-VI and VI+ generations; wave counts not listed get the generation's
// addressable maximum (102 on VI+, 103 earlier).
unsigned SIRegisterInfo::getNumSGPRsAllowed(AMDGPUSubtarget::Generation gen,
                                            unsigned WaveCount) const {
  if (gen >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (WaveCount == 10 || WaveCount == 9)
      return 80;
    if (WaveCount == 8)
      return 96;
    return 102;
  }

  switch (WaveCount) {
  case 10: return 48;
  case 9:  return 56;
  case 8:  return 64;
  case 7:  return 72;
  case 6:  return 80;
  case 5:  return 96;
  default: return 103;
  }
}
|