mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[AMDGPU] Wave and register controls
- Implemented amdgpu-flat-work-group-size attribute
- Implemented amdgpu-num-active-waves-per-eu attribute
- Implemented amdgpu-num-sgpr attribute
- Implemented amdgpu-num-vgpr attribute
- Dynamic LDS constraints are in a separate patch

Patch by Tom Stellard and Konstantin Zhuravlyov

Differential Revision: https://reviews.llvm.org/D21562

llvm-svn: 280747
This commit is contained in:
parent
66de9e0d16
commit
0da0753352
@ -202,6 +202,16 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
||||
OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
|
||||
" bytes/workgroup (compile time only)", false);
|
||||
|
||||
OutStreamer->emitRawComment(" SGPRBlocks: " +
|
||||
Twine(KernelInfo.SGPRBlocks), false);
|
||||
OutStreamer->emitRawComment(" VGPRBlocks: " +
|
||||
Twine(KernelInfo.VGPRBlocks), false);
|
||||
|
||||
OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
|
||||
Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
|
||||
OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
|
||||
Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
|
||||
|
||||
OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
|
||||
false);
|
||||
OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
|
||||
@ -446,20 +456,15 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
ExtraSGPRs = 6;
|
||||
}
|
||||
|
||||
MaxSGPR += ExtraSGPRs;
|
||||
|
||||
// Record first reserved register and reserved register count fields, and
|
||||
// update max register counts if "amdgpu-debugger-reserve-regs" attribute was
|
||||
// specified.
|
||||
if (STM.debuggerReserveRegs()) {
|
||||
ProgInfo.ReservedVGPRFirst = MaxVGPR + 1;
|
||||
ProgInfo.ReservedVGPRCount = MFI->getDebuggerReservedVGPRCount();
|
||||
MaxVGPR += MFI->getDebuggerReservedVGPRCount();
|
||||
}
|
||||
// requested.
|
||||
ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
|
||||
ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
|
||||
|
||||
// Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
|
||||
// DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
|
||||
// attribute was specified.
|
||||
// attribute was requested.
|
||||
if (STM.debuggerEmitPrologue()) {
|
||||
ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
|
||||
RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
|
||||
@ -467,11 +472,22 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
RI->getHWRegIndex(MFI->getScratchRSrcReg());
|
||||
}
|
||||
|
||||
// Account for extra SGPRs and VGPRs reserved for debugger use.
|
||||
MaxSGPR += ExtraSGPRs;
|
||||
MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
|
||||
|
||||
// We found the maximum register index. They start at 0, so add one to get the
|
||||
// number of registers.
|
||||
ProgInfo.NumVGPR = MaxVGPR + 1;
|
||||
ProgInfo.NumSGPR = MaxSGPR + 1;
|
||||
|
||||
// Adjust number of registers used to meet default/requested minimum/maximum
|
||||
// number of waves per execution unit request.
|
||||
ProgInfo.NumSGPRsForWavesPerEU = std::max(
|
||||
ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
|
||||
ProgInfo.NumVGPRsForWavesPerEU = std::max(
|
||||
ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
|
||||
|
||||
if (STM.hasSGPRInitBug()) {
|
||||
if (ProgInfo.NumSGPR > SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG) {
|
||||
LLVMContext &Ctx = MF.getFunction()->getContext();
|
||||
@ -482,6 +498,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
}
|
||||
|
||||
ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
|
||||
ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
|
||||
}
|
||||
|
||||
if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
|
||||
@ -498,8 +515,16 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
Ctx.diagnose(Diag);
|
||||
}
|
||||
|
||||
ProgInfo.VGPRBlocks = (ProgInfo.NumVGPR - 1) / 4;
|
||||
ProgInfo.SGPRBlocks = (ProgInfo.NumSGPR - 1) / 8;
|
||||
// SGPRBlocks is actual number of SGPR blocks minus 1.
|
||||
ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
|
||||
RI->getSGPRAllocGranule());
|
||||
ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
|
||||
|
||||
// VGPRBlocks is actual number of VGPR blocks minus 1.
|
||||
ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
|
||||
RI->getVGPRAllocGranule());
|
||||
ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
|
||||
|
||||
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
|
||||
// register.
|
||||
ProgInfo.FloatMode = getFPMode(MF);
|
||||
@ -525,8 +550,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
|
||||
LDSAlignShift = 9;
|
||||
}
|
||||
|
||||
unsigned LDSSpillSize = MFI->LDSWaveSpillSize *
|
||||
MFI->getMaximumWorkGroupSize(MF);
|
||||
unsigned LDSSpillSize =
|
||||
MFI->LDSWaveSpillSize * MFI->getMaxFlatWorkGroupSize();
|
||||
|
||||
ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
|
||||
ProgInfo.LDSBlocks =
|
||||
|
@ -40,6 +40,8 @@ private:
|
||||
NumVGPR(0),
|
||||
NumSGPR(0),
|
||||
FlatUsed(false),
|
||||
NumSGPRsForWavesPerEU(0),
|
||||
NumVGPRsForWavesPerEU(0),
|
||||
ReservedVGPRFirst(0),
|
||||
ReservedVGPRCount(0),
|
||||
DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1),
|
||||
@ -71,15 +73,23 @@ private:
|
||||
uint32_t LDSSize;
|
||||
bool FlatUsed;
|
||||
|
||||
// Number of SGPRs that meets number of waves per execution unit request.
|
||||
uint32_t NumSGPRsForWavesPerEU;
|
||||
|
||||
// Number of VGPRs that meets number of waves per execution unit request.
|
||||
uint32_t NumVGPRsForWavesPerEU;
|
||||
|
||||
// If ReservedVGPRCount is 0 then must be 0. Otherwise, this is the first
|
||||
// fixed VGPR number reserved.
|
||||
uint16_t ReservedVGPRFirst;
|
||||
|
||||
// The number of consecutive VGPRs reserved.
|
||||
uint16_t ReservedVGPRCount;
|
||||
|
||||
// Fixed SGPR number used to hold wave scratch offset for entire kernel
|
||||
// execution, or uint16_t(-1) if the register is not used or not known.
|
||||
uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR;
|
||||
|
||||
// Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire
|
||||
// kernel execution, or uint16_t(-1) if the register is not used or not
|
||||
// known.
|
||||
|
@ -184,13 +184,12 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
|
||||
|
||||
// TODO: Have some sort of hint or other heuristics to guess occupancy based
|
||||
// on other factors..
|
||||
unsigned OccupancyHint
|
||||
= AMDGPU::getIntegerAttribute(F, "amdgpu-max-waves-per-eu", 0);
|
||||
unsigned OccupancyHint = ST.getWavesPerEU(F).second;
|
||||
if (OccupancyHint == 0)
|
||||
OccupancyHint = 7;
|
||||
|
||||
// Clamp to max value.
|
||||
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerCU());
|
||||
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
|
||||
|
||||
// Check the hint but ignore it if it's obviously wrong from the existing LDS
|
||||
// usage.
|
||||
@ -650,9 +649,11 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
|
||||
if (AMDGPU::isShader(ContainingFunction.getCallingConv()))
|
||||
return;
|
||||
|
||||
const AMDGPUSubtarget &ST =
|
||||
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
|
||||
// FIXME: We should also try to get this value from the reqd_work_group_size
|
||||
// function attribute if it is available.
|
||||
unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
|
||||
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
|
||||
|
||||
const DataLayout &DL = Mod->getDataLayout();
|
||||
|
||||
|
@ -179,6 +179,88 @@ unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
|
||||
const Function &F) const {
|
||||
|
||||
// Default minimum/maximum flat work group sizes.
|
||||
std::pair<unsigned, unsigned> Default =
|
||||
AMDGPU::isCompute(F.getCallingConv()) ?
|
||||
std::pair<unsigned, unsigned>(getWavefrontSize() * 2,
|
||||
getWavefrontSize() * 4) :
|
||||
std::pair<unsigned, unsigned>(1, getWavefrontSize());
|
||||
|
||||
// TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
|
||||
// starts using "amdgpu-flat-work-group-size" attribute.
|
||||
Default.second = AMDGPU::getIntegerAttribute(
|
||||
F, "amdgpu-max-work-group-size", Default.second);
|
||||
Default.first = std::min(Default.first, Default.second);
|
||||
|
||||
// Requested minimum/maximum flat work group sizes.
|
||||
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
|
||||
F, "amdgpu-flat-work-group-size", Default);
|
||||
|
||||
// Make sure requested minimum is less than requested maximum.
|
||||
if (Requested.first > Requested.second)
|
||||
return Default;
|
||||
|
||||
// Make sure requested values do not violate subtarget's specifications.
|
||||
if (Requested.first < getMinFlatWorkGroupSize())
|
||||
return Default;
|
||||
if (Requested.second > getMaxFlatWorkGroupSize())
|
||||
return Default;
|
||||
|
||||
return Requested;
|
||||
}
|
||||
|
||||
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
|
||||
const Function &F) const {
|
||||
|
||||
// Default minimum/maximum number of waves per execution unit.
|
||||
std::pair<unsigned, unsigned> Default(1, 0);
|
||||
|
||||
// Default/requested minimum/maximum flat work group sizes.
|
||||
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
|
||||
|
||||
// If minimum/maximum flat work group sizes were explicitly requested using
|
||||
// "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
|
||||
// number of waves per execution unit to values implied by requested
|
||||
// minimum/maximum flat work group sizes.
|
||||
unsigned MinImpliedByFlatWorkGroupSize =
|
||||
getMaxWavesPerEU(FlatWorkGroupSizes.second);
|
||||
bool RequestedFlatWorkGroupSize = false;
|
||||
|
||||
// TODO: Do not process "amdgpu-max-work-group-size" attribute once mesa
|
||||
// starts using "amdgpu-flat-work-group-size" attribute.
|
||||
if (F.hasFnAttribute("amdgpu-max-work-group-size") ||
|
||||
F.hasFnAttribute("amdgpu-flat-work-group-size")) {
|
||||
Default.first = MinImpliedByFlatWorkGroupSize;
|
||||
RequestedFlatWorkGroupSize = true;
|
||||
}
|
||||
|
||||
// Requested minimum/maximum number of waves per execution unit.
|
||||
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
|
||||
F, "amdgpu-waves-per-eu", Default, true);
|
||||
|
||||
// Make sure requested minimum is less than requested maximum.
|
||||
if (Requested.second && Requested.first > Requested.second)
|
||||
return Default;
|
||||
|
||||
// Make sure requested values do not violate subtarget's specifications.
|
||||
if (Requested.first < getMinWavesPerEU() ||
|
||||
Requested.first > getMaxWavesPerEU())
|
||||
return Default;
|
||||
if (Requested.second > getMaxWavesPerEU())
|
||||
return Default;
|
||||
|
||||
// Make sure requested values are compatible with values implied by requested
|
||||
// minimum/maximum flat work group sizes.
|
||||
if (RequestedFlatWorkGroupSize &&
|
||||
Requested.first > MinImpliedByFlatWorkGroupSize)
|
||||
return Default;
|
||||
|
||||
return Requested;
|
||||
}
|
||||
|
||||
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
||||
const TargetMachine &TM) :
|
||||
AMDGPUSubtarget(TT, GPU, FS, TM),
|
||||
|
@ -270,14 +270,6 @@ public:
|
||||
return EnableXNACK;
|
||||
}
|
||||
|
||||
unsigned getMaxWavesPerCU() const {
|
||||
if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
|
||||
return 10;
|
||||
|
||||
// FIXME: Not sure what this is for other subtargets.
|
||||
return 8;
|
||||
}
|
||||
|
||||
/// \brief Returns the offset in bytes from the start of the input buffer
|
||||
/// of the first explicit kernel argument.
|
||||
unsigned getExplicitKernelArgOffset() const {
|
||||
@ -296,6 +288,89 @@ public:
|
||||
bool enableSubRegLiveness() const override {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// \returns Number of execution units per compute unit supported by the
|
||||
/// subtarget.
|
||||
unsigned getEUsPerCU() const {
|
||||
return 4;
|
||||
}
|
||||
|
||||
/// \returns Maximum number of work groups per compute unit supported by the
|
||||
/// subtarget and limited by given flat work group size.
|
||||
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
|
||||
if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
|
||||
return 8;
|
||||
return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
|
||||
}
|
||||
|
||||
/// \returns Maximum number of waves per compute unit supported by the
|
||||
/// subtarget without any kind of limitation.
|
||||
unsigned getMaxWavesPerCU() const {
|
||||
return getMaxWavesPerEU() * getEUsPerCU();
|
||||
}
|
||||
|
||||
/// \returns Maximum number of waves per compute unit supported by the
|
||||
/// subtarget and limited by given flat work group size.
|
||||
unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
|
||||
return getWavesPerWorkGroup(FlatWorkGroupSize);
|
||||
}
|
||||
|
||||
/// \returns Minimum number of waves per execution unit supported by the
|
||||
/// subtarget.
|
||||
unsigned getMinWavesPerEU() const {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/// \returns Maximum number of waves per execution unit supported by the
|
||||
/// subtarget without any kind of limitation.
|
||||
unsigned getMaxWavesPerEU() const {
|
||||
if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
|
||||
return 8;
|
||||
// FIXME: Need to take scratch memory into account.
|
||||
return 10;
|
||||
}
|
||||
|
||||
/// \returns Maximum number of waves per execution unit supported by the
|
||||
/// subtarget and limited by given flat work group size.
|
||||
unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
|
||||
return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
|
||||
getEUsPerCU();
|
||||
}
|
||||
|
||||
/// \returns Minimum flat work group size supported by the subtarget.
|
||||
unsigned getMinFlatWorkGroupSize() const {
|
||||
return 1;
|
||||
}
|
||||
|
||||
/// \returns Maximum flat work group size supported by the subtarget.
|
||||
unsigned getMaxFlatWorkGroupSize() const {
|
||||
return 2048;
|
||||
}
|
||||
|
||||
/// \returns Number of waves per work group given the flat work group size.
|
||||
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
|
||||
return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
|
||||
}
|
||||
|
||||
/// \returns Subtarget's default pair of minimum/maximum flat work group sizes
|
||||
/// for function \p F, or minimum/maximum flat work group sizes explicitly
|
||||
/// requested using "amdgpu-flat-work-group-size" attribute attached to
|
||||
/// function \p F.
|
||||
///
|
||||
/// \returns Subtarget's default values if explicitly requested values cannot
|
||||
/// be converted to integer, or violate subtarget's specifications.
|
||||
std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
|
||||
|
||||
/// \returns Subtarget's default pair of minimum/maximum number of waves per
|
||||
/// execution unit for function \p F, or minimum/maximum number of waves per
|
||||
/// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
|
||||
/// attached to function \p F.
|
||||
///
|
||||
/// \returns Subtarget's default values if explicitly requested values cannot
|
||||
/// be converted to integer, violate subtarget's specifications, or are not
|
||||
/// compatible with minimum/maximum number of waves limited by flat work group
|
||||
/// size, register usage, and/or lds usage.
|
||||
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
|
||||
};
|
||||
|
||||
class R600Subtarget final : public AMDGPUSubtarget {
|
||||
|
@ -144,8 +144,8 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
|
||||
unsigned VGPRExcessLimit =
|
||||
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
|
||||
unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
|
||||
unsigned SGPRCriticalLimit = SRI->getNumSGPRsAllowed(ST, MaxWaves);
|
||||
unsigned VGPRCriticalLimit = SRI->getNumVGPRsAllowed(MaxWaves);
|
||||
unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves);
|
||||
unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves);
|
||||
|
||||
ReadyQueue &Q = Zone.Available;
|
||||
for (SUnit *SU : Q) {
|
||||
|
@ -27,14 +27,16 @@ static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
|
||||
(!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
|
||||
}
|
||||
|
||||
static ArrayRef<MCPhysReg> getAllSGPR128() {
|
||||
static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
|
||||
const SIRegisterInfo *TRI) {
|
||||
return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
|
||||
AMDGPU::SGPR_128RegClass.getNumRegs());
|
||||
TRI->getMaxNumSGPRs(MF) / 4);
|
||||
}
|
||||
|
||||
static ArrayRef<MCPhysReg> getAllSGPRs() {
|
||||
static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
|
||||
const SIRegisterInfo *TRI) {
|
||||
return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
|
||||
AMDGPU::SGPR_32RegClass.getNumRegs());
|
||||
TRI->getMaxNumSGPRs(MF));
|
||||
}
|
||||
|
||||
void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
|
||||
@ -117,7 +119,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
|
||||
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
|
||||
// Skip the last 2 elements because the last one is reserved for VCC, and
|
||||
// this is the 2nd to last element already.
|
||||
for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
|
||||
for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
|
||||
// Pick the first unallocated one. Make sure we don't clobber the other
|
||||
// reserved input we needed.
|
||||
if (!MRI.isPhysRegUsed(Reg)) {
|
||||
@ -159,7 +161,7 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
|
||||
// are no other free SGPRs, then the value will stay in this register.
|
||||
// ----
|
||||
// 13
|
||||
for (MCPhysReg Reg : getAllSGPRs().drop_back(13).slice(NumPreloaded)) {
|
||||
for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
|
||||
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
|
||||
// scratch descriptor, since we haven't added its uses yet.
|
||||
if (!MRI.isPhysRegUsed(Reg)) {
|
||||
|
@ -728,7 +728,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
|
||||
const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
|
||||
const SIRegisterInfo *TRI = ST.getRegisterInfo();
|
||||
DebugLoc DL = MBB.findDebugLoc(MI);
|
||||
unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
|
||||
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
|
||||
unsigned WavefrontSize = ST.getWavefrontSize();
|
||||
|
||||
unsigned TIDReg = MFI->getTIDReg();
|
||||
|
@ -48,8 +48,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
||||
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
|
||||
PSInputAddr(0),
|
||||
ReturnsVoid(true),
|
||||
MaximumWorkGroupSize(0),
|
||||
DebuggerReservedVGPRCount(0),
|
||||
FlatWorkGroupSizes(0, 0),
|
||||
WavesPerEU(0, 0),
|
||||
DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
|
||||
DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
|
||||
LDSWaveSpillSize(0),
|
||||
@ -135,13 +135,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
||||
ST.isAmdHsaOS())
|
||||
FlatScratchInit = true;
|
||||
|
||||
if (AMDGPU::isCompute(F->getCallingConv()))
|
||||
MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
|
||||
else
|
||||
MaximumWorkGroupSize = ST.getWavefrontSize();
|
||||
|
||||
if (ST.debuggerReserveRegs())
|
||||
DebuggerReservedVGPRCount = 4;
|
||||
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
|
||||
WavesPerEU = ST.getWavesPerEU(*F);
|
||||
}
|
||||
|
||||
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
|
||||
@ -229,8 +224,3 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
|
||||
Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
|
||||
return Spill;
|
||||
}
|
||||
|
||||
unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
|
||||
const MachineFunction &MF) const {
|
||||
return MaximumWorkGroupSize;
|
||||
}
|
||||
|
@ -60,10 +60,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
|
||||
unsigned PSInputAddr;
|
||||
bool ReturnsVoid;
|
||||
|
||||
unsigned MaximumWorkGroupSize;
|
||||
// A pair of default/requested minimum/maximum flat work group sizes.
|
||||
// Minimum - first, maximum - second.
|
||||
std::pair<unsigned, unsigned> FlatWorkGroupSizes;
|
||||
|
||||
// A pair of default/requested minimum/maximum number of waves per execution
|
||||
// unit. Minimum - first, maximum - second.
|
||||
std::pair<unsigned, unsigned> WavesPerEU;
|
||||
|
||||
// Number of reserved VGPRs for debugger usage.
|
||||
unsigned DebuggerReservedVGPRCount;
|
||||
// Stack object indices for work group IDs.
|
||||
std::array<int, 3> DebuggerWorkGroupIDStackObjectIndices;
|
||||
// Stack object indices for work item IDs.
|
||||
@ -343,9 +347,36 @@ public:
|
||||
ReturnsVoid = Value;
|
||||
}
|
||||
|
||||
/// \returns Number of reserved VGPRs for debugger usage.
|
||||
unsigned getDebuggerReservedVGPRCount() const {
|
||||
return DebuggerReservedVGPRCount;
|
||||
/// \returns A pair of default/requested minimum/maximum flat work group sizes
|
||||
/// for this function.
|
||||
std::pair<unsigned, unsigned> getFlatWorkGroupSizes() const {
|
||||
return FlatWorkGroupSizes;
|
||||
}
|
||||
|
||||
/// \returns Default/requested minimum flat work group size for this function.
|
||||
unsigned getMinFlatWorkGroupSize() const {
|
||||
return FlatWorkGroupSizes.first;
|
||||
}
|
||||
|
||||
/// \returns Default/requested maximum flat work group size for this function.
|
||||
unsigned getMaxFlatWorkGroupSize() const {
|
||||
return FlatWorkGroupSizes.second;
|
||||
}
|
||||
|
||||
/// \returns A pair of default/requested minimum/maximum number of waves per
|
||||
/// execution unit.
|
||||
std::pair<unsigned, unsigned> getWavesPerEU() const {
|
||||
return WavesPerEU;
|
||||
}
|
||||
|
||||
/// \returns Default/requested minimum number of waves per execution unit.
|
||||
unsigned getMinWavesPerEU() const {
|
||||
return WavesPerEU.first;
|
||||
}
|
||||
|
||||
/// \returns Default/requested maximum number of waves per execution unit.
|
||||
unsigned getMaxWavesPerEU() const {
|
||||
return WavesPerEU.second;
|
||||
}
|
||||
|
||||
/// \returns Stack object index for \p Dim's work group ID.
|
||||
@ -403,8 +434,6 @@ public:
|
||||
}
|
||||
llvm_unreachable("unexpected dimension");
|
||||
}
|
||||
|
||||
unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const;
|
||||
};
|
||||
|
||||
} // End namespace llvm
|
||||
|
@ -24,53 +24,6 @@
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
|
||||
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
unsigned SIMDPerCU = 4;
|
||||
|
||||
unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
|
||||
return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
|
||||
MaxInvocationsPerWave;
|
||||
}
|
||||
|
||||
static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
|
||||
|
||||
unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
|
||||
unsigned ReservedSGPRCount;
|
||||
|
||||
if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
|
||||
TotalSGPRCountPerSIMD = 800;
|
||||
AddressableSGPRCount = 102;
|
||||
SGPRUsageAlignment = 16;
|
||||
ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
|
||||
} else {
|
||||
TotalSGPRCountPerSIMD = 512;
|
||||
AddressableSGPRCount = 104;
|
||||
SGPRUsageAlignment = 8;
|
||||
ReservedSGPRCount = 2; // VCC
|
||||
}
|
||||
|
||||
unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
|
||||
MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
|
||||
|
||||
if (ST.hasSGPRInitBug())
|
||||
MaxSGPRCount = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
|
||||
|
||||
return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
|
||||
}
|
||||
|
||||
static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
|
||||
unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
|
||||
unsigned TotalVGPRCountPerSIMD = 256;
|
||||
unsigned VGPRUsageAlignment = 4;
|
||||
|
||||
return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
|
||||
VGPRUsageAlignment);
|
||||
}
|
||||
|
||||
static bool hasPressureSet(const int *PSets, unsigned PSetID) {
|
||||
for (unsigned i = 0; PSets[i] != -1; ++i) {
|
||||
if (PSets[i] == (int)PSetID)
|
||||
@ -138,14 +91,14 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
|
||||
|
||||
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
|
||||
const MachineFunction &MF) const {
|
||||
unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
|
||||
unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
|
||||
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
|
||||
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
|
||||
const MachineFunction &MF) const {
|
||||
unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
|
||||
unsigned RegCount = getMaxNumSGPRs(MF);
|
||||
unsigned Reg;
|
||||
|
||||
// Try to place it in a hole after PrivateSegmentbufferReg.
|
||||
@ -180,18 +133,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
||||
reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
|
||||
reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
|
||||
|
||||
unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
|
||||
unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
|
||||
|
||||
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
|
||||
unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
|
||||
for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
|
||||
unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
|
||||
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
|
||||
for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
|
||||
unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
|
||||
reserveRegisterTuples(Reserved, Reg);
|
||||
}
|
||||
|
||||
|
||||
for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
|
||||
unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
|
||||
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
|
||||
for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
|
||||
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
|
||||
reserveRegisterTuples(Reserved, Reg);
|
||||
}
|
||||
@ -213,48 +164,9 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
|
||||
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
|
||||
}
|
||||
|
||||
// Reserve registers for debugger usage if "amdgpu-debugger-reserve-trap-regs"
|
||||
// attribute was specified.
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
if (ST.debuggerReserveRegs()) {
|
||||
unsigned ReservedVGPRFirst =
|
||||
MaxWorkGroupVGPRCount - MFI->getDebuggerReservedVGPRCount();
|
||||
for (unsigned i = ReservedVGPRFirst; i < MaxWorkGroupVGPRCount; ++i) {
|
||||
unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
|
||||
reserveRegisterTuples(Reserved, Reg);
|
||||
}
|
||||
}
|
||||
|
||||
return Reserved;
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
|
||||
unsigned Idx) const {
|
||||
const SISubtarget &STI = MF.getSubtarget<SISubtarget>();
|
||||
// FIXME: We should adjust the max number of waves based on LDS size.
|
||||
unsigned SGPRLimit = getNumSGPRsAllowed(STI, STI.getMaxWavesPerCU());
|
||||
unsigned VGPRLimit = getNumVGPRsAllowed(STI.getMaxWavesPerCU());
|
||||
|
||||
unsigned VSLimit = SGPRLimit + VGPRLimit;
|
||||
|
||||
if (SGPRPressureSets.test(Idx) && VGPRPressureSets.test(Idx)) {
|
||||
// FIXME: This is a hack. We should never be considering the pressure of
|
||||
// these since no virtual register should ever have this class.
|
||||
return VSLimit;
|
||||
}
|
||||
|
||||
if (SGPRPressureSets.test(Idx))
|
||||
return SGPRLimit;
|
||||
|
||||
return VGPRLimit;
|
||||
}
|
||||
|
||||
unsigned
|
||||
SIRegisterInfo::getDefaultRegPressureSetLimit(const MachineFunction &MF,
|
||||
unsigned Idx) const {
|
||||
return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
|
||||
}
|
||||
|
||||
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
|
||||
return Fn.getFrameInfo().hasStackObjects();
|
||||
}
|
||||
@ -956,43 +868,6 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
|
||||
return AMDGPU::NoRegister;
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getNumVGPRsAllowed(unsigned WaveCount) const {
|
||||
switch(WaveCount) {
|
||||
case 10: return 24;
|
||||
case 9: return 28;
|
||||
case 8: return 32;
|
||||
case 7: return 36;
|
||||
case 6: return 40;
|
||||
case 5: return 48;
|
||||
case 4: return 64;
|
||||
case 3: return 84;
|
||||
case 2: return 128;
|
||||
default: return 256;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getNumSGPRsAllowed(const SISubtarget &ST,
|
||||
unsigned WaveCount) const {
|
||||
if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
|
||||
switch (WaveCount) {
|
||||
case 10: return 80;
|
||||
case 9: return 80;
|
||||
case 8: return 96;
|
||||
default: return 102;
|
||||
}
|
||||
} else {
|
||||
switch(WaveCount) {
|
||||
case 10: return 48;
|
||||
case 9: return 56;
|
||||
case 8: return 64;
|
||||
case 7: return 72;
|
||||
case 6: return 80;
|
||||
case 5: return 96;
|
||||
default: return 103;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
|
||||
unsigned Reg) const {
|
||||
const TargetRegisterClass *RC;
|
||||
@ -1003,3 +878,183 @@ bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
|
||||
|
||||
return hasVGPRs(RC);
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
|
||||
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
|
||||
return 800;
|
||||
return 512;
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
|
||||
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
|
||||
return 102;
|
||||
return 104;
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST) const {
|
||||
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
|
||||
return 6; // VCC, FLAT_SCRATCH, XNACK.
|
||||
return 2; // VCC.
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
|
||||
unsigned WavesPerEU) const {
|
||||
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
|
||||
switch (WavesPerEU) {
|
||||
case 0: return 0;
|
||||
case 10: return 0;
|
||||
case 9: return 0;
|
||||
case 8: return 81;
|
||||
default: return 97;
|
||||
}
|
||||
} else {
|
||||
switch (WavesPerEU) {
|
||||
case 0: return 0;
|
||||
case 10: return 0;
|
||||
case 9: return 49;
|
||||
case 8: return 57;
|
||||
case 7: return 65;
|
||||
case 6: return 73;
|
||||
case 5: return 81;
|
||||
default: return 97;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
|
||||
unsigned WavesPerEU) const {
|
||||
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
|
||||
switch (WavesPerEU) {
|
||||
case 0: return 80;
|
||||
case 10: return 80;
|
||||
case 9: return 80;
|
||||
case 8: return 96;
|
||||
default: return getNumAddressableSGPRs(ST);
|
||||
}
|
||||
} else {
|
||||
switch (WavesPerEU) {
|
||||
case 0: return 48;
|
||||
case 10: return 48;
|
||||
case 9: return 56;
|
||||
case 8: return 64;
|
||||
case 7: return 72;
|
||||
case 6: return 80;
|
||||
case 5: return 96;
|
||||
default: return getNumAddressableSGPRs(ST);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
|
||||
const Function &F = *MF.getFunction();
|
||||
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
// Compute maximum number of SGPRs function can use using default/requested
|
||||
// minimum number of waves per execution unit.
|
||||
std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
|
||||
unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first);
|
||||
|
||||
// Check if maximum number of SGPRs was explicitly requested using
|
||||
// "amdgpu-num-sgpr" attribute.
|
||||
if (F.hasFnAttribute("amdgpu-num-sgpr")) {
|
||||
unsigned Requested = AMDGPU::getIntegerAttribute(
|
||||
F, "amdgpu-num-sgpr", MaxNumSGPRs);
|
||||
|
||||
// Make sure requested value does not violate subtarget's specifications.
|
||||
if (Requested && Requested <= getNumReservedSGPRs(ST))
|
||||
Requested = 0;
|
||||
|
||||
// Make sure requested value is compatible with values implied by
|
||||
// default/requested minimum/maximum number of waves per execution unit.
|
||||
if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first))
|
||||
Requested = 0;
|
||||
if (WavesPerEU.second &&
|
||||
Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
|
||||
Requested = 0;
|
||||
|
||||
if (Requested)
|
||||
MaxNumSGPRs = Requested;
|
||||
}
|
||||
|
||||
if (ST.hasSGPRInitBug())
|
||||
MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
|
||||
|
||||
return MaxNumSGPRs - getNumReservedSGPRs(ST);
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
|
||||
const SISubtarget &ST) const {
|
||||
if (ST.debuggerReserveRegs())
|
||||
return 4;
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Returns the minimum VGPR count associated with \p WavesPerEU waves per
// execution unit.
unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
  if (WavesPerEU == 0)
    return 0;

  // Values for WavesPerEU = 2, 3, ..., 10 (indexed by WavesPerEU - 2).
  static const unsigned MinByWaves[] = {85, 65, 49, 41, 37, 33, 29, 25, 0};
  if (WavesPerEU >= 2 && WavesPerEU <= 10)
    return MinByWaves[WavesPerEU - 2];
  return 129;
}
|
||||
|
||||
unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
|
||||
switch (WavesPerEU) {
|
||||
case 0: return 24;
|
||||
case 10: return 24;
|
||||
case 9: return 28;
|
||||
case 8: return 32;
|
||||
case 7: return 36;
|
||||
case 6: return 40;
|
||||
case 5: return 48;
|
||||
case 4: return 64;
|
||||
case 3: return 84;
|
||||
case 2: return 128;
|
||||
default: return getTotalNumVGPRs();
|
||||
}
|
||||
}
|
||||
|
||||
unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
|
||||
const Function &F = *MF.getFunction();
|
||||
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
// Compute maximum number of VGPRs function can use using default/requested
|
||||
// minimum number of waves per execution unit.
|
||||
std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
|
||||
unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
|
||||
|
||||
// Check if maximum number of VGPRs was explicitly requested using
|
||||
// "amdgpu-num-vgpr" attribute.
|
||||
if (F.hasFnAttribute("amdgpu-num-vgpr")) {
|
||||
unsigned Requested = AMDGPU::getIntegerAttribute(
|
||||
F, "amdgpu-num-vgpr", MaxNumVGPRs);
|
||||
|
||||
// Make sure requested value does not violate subtarget's specifications.
|
||||
if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
|
||||
Requested = 0;
|
||||
|
||||
// Make sure requested value is compatible with values implied by
|
||||
// default/requested minimum/maximum number of waves per execution unit.
|
||||
if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
|
||||
Requested = 0;
|
||||
if (WavesPerEU.second &&
|
||||
Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
|
||||
Requested = 0;
|
||||
|
||||
if (Requested)
|
||||
MaxNumVGPRs = Requested;
|
||||
}
|
||||
|
||||
return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
|
||||
}
|
||||
|
@ -48,12 +48,6 @@ public:
|
||||
|
||||
BitVector getReservedRegs(const MachineFunction &MF) const override;
|
||||
|
||||
unsigned getRegPressureSetLimit(const MachineFunction &MF,
|
||||
unsigned Idx) const override;
|
||||
|
||||
unsigned getDefaultRegPressureSetLimit(const MachineFunction &MF,
|
||||
unsigned Idx) const;
|
||||
|
||||
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
|
||||
|
||||
|
||||
@ -172,14 +166,6 @@ public:
|
||||
unsigned getPreloadedValue(const MachineFunction &MF,
|
||||
enum PreloadedValue Value) const;
|
||||
|
||||
/// \brief Give the maximum number of VGPRs that can be used by \p WaveCount
|
||||
/// concurrent waves.
|
||||
unsigned getNumVGPRsAllowed(unsigned WaveCount) const;
|
||||
|
||||
/// \brief Give the maximum number of SGPRs that can be used by \p WaveCount
|
||||
/// concurrent waves.
|
||||
unsigned getNumSGPRsAllowed(const SISubtarget &ST, unsigned WaveCount) const;
|
||||
|
||||
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
|
||||
const TargetRegisterClass *RC,
|
||||
const MachineFunction &MF) const;
|
||||
@ -196,6 +182,70 @@ public:
|
||||
return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
|
||||
}
|
||||
|
||||
/// \returns SGPR allocation granularity supported by the subtarget.
|
||||
unsigned getSGPRAllocGranule() const {
|
||||
return 8;
|
||||
}
|
||||
|
||||
/// \returns Total number of SGPRs supported by the subtarget.
|
||||
unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
|
||||
|
||||
/// \returns Number of addressable SGPRs supported by the subtarget.
|
||||
unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
|
||||
|
||||
/// \returns Number of reserved SGPRs supported by the subtarget.
|
||||
unsigned getNumReservedSGPRs(const SISubtarget &ST) const;
|
||||
|
||||
/// \returns Minimum number of SGPRs that meets given number of waves per
|
||||
/// execution unit requirement for given subtarget.
|
||||
unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
|
||||
|
||||
/// \returns Maximum number of SGPRs that meets given number of waves per
|
||||
/// execution unit requirement for given subtarget.
|
||||
unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
|
||||
|
||||
/// \returns Maximum number of SGPRs that meets number of waves per execution
|
||||
/// unit requirement for function \p MF, or number of SGPRs explicitly
|
||||
/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
|
||||
///
|
||||
/// \returns Value that meets number of waves per execution unit requirement
|
||||
/// if explicitly requested value cannot be converted to integer, violates
|
||||
/// subtarget's specifications, or does not meet number of waves per execution
|
||||
/// unit requirement.
|
||||
unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
|
||||
|
||||
/// \returns VGPR allocation granularity supported by the subtarget.
|
||||
unsigned getVGPRAllocGranule() const {
|
||||
return 4;
|
||||
}
|
||||
|
||||
/// \returns Total number of VGPRs supported by the subtarget.
|
||||
unsigned getTotalNumVGPRs() const {
|
||||
return 256;
|
||||
}
|
||||
|
||||
/// \returns Number of reserved VGPRs for debugger use supported by the
|
||||
/// subtarget.
|
||||
unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
|
||||
|
||||
/// \returns Minimum number of VGPRs that meets given number of waves per
|
||||
/// execution unit requirement.
|
||||
unsigned getMinNumVGPRs(unsigned WavesPerEU) const;
|
||||
|
||||
/// \returns Maximum number of VGPRs that meets given number of waves per
|
||||
/// execution unit requirement.
|
||||
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;
|
||||
|
||||
/// \returns Maximum number of VGPRs that meets number of waves per execution
|
||||
/// unit requirement for function \p MF, or number of VGPRs explicitly
|
||||
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
|
||||
///
|
||||
/// \returns Value that meets number of waves per execution unit requirement
|
||||
/// if explicitly requested value cannot be converted to integer, violates
|
||||
/// subtarget's specifications, or does not meet number of waves per execution
|
||||
/// unit requirement.
|
||||
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
|
||||
|
||||
private:
|
||||
void buildScratchLoadStore(MachineBasicBlock::iterator MI,
|
||||
unsigned LoadStoreOp, const MachineOperand *SrcDst,
|
||||
|
@ -124,8 +124,29 @@ int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
|
||||
return Result;
|
||||
}
|
||||
|
||||
unsigned getMaximumWorkGroupSize(const Function &F) {
|
||||
return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
|
||||
std::pair<int, int> getIntegerPairAttribute(const Function &F,
|
||||
StringRef Name,
|
||||
std::pair<int, int> Default,
|
||||
bool OnlyFirstRequired) {
|
||||
Attribute A = F.getFnAttribute(Name);
|
||||
if (!A.isStringAttribute())
|
||||
return Default;
|
||||
|
||||
LLVMContext &Ctx = F.getContext();
|
||||
std::pair<int, int> Ints = Default;
|
||||
std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
|
||||
if (Strs.first.trim().getAsInteger(0, Ints.first)) {
|
||||
Ctx.emitError("can't parse first integer attribute " + Name);
|
||||
return Default;
|
||||
}
|
||||
if (Strs.second.trim().getAsInteger(0, Ints.second)) {
|
||||
if (!OnlyFirstRequired || Strs.second.trim().size()) {
|
||||
Ctx.emitError("can't parse second integer attribute " + Name);
|
||||
return Default;
|
||||
}
|
||||
}
|
||||
|
||||
return Ints;
|
||||
}
|
||||
|
||||
unsigned getInitialPSInputAddr(const Function &F) {
|
||||
|
@ -45,9 +45,28 @@ bool isGroupSegment(const GlobalValue *GV);
|
||||
bool isGlobalSegment(const GlobalValue *GV);
|
||||
bool isReadOnlySegment(const GlobalValue *GV);
|
||||
|
||||
/// \returns Integer value requested using \p F's \p Name attribute.
|
||||
///
|
||||
/// \returns \p Default if attribute is not present.
|
||||
///
|
||||
/// \returns \p Default and emits error if requested value cannot be converted
|
||||
/// to integer.
|
||||
int getIntegerAttribute(const Function &F, StringRef Name, int Default);
|
||||
|
||||
unsigned getMaximumWorkGroupSize(const Function &F);
|
||||
/// \returns A pair of integer values requested using \p F's \p Name attribute
|
||||
/// in "first[,second]" format ("second" is optional unless \p OnlyFirstRequired
|
||||
/// is false).
|
||||
///
|
||||
/// \returns \p Default if attribute is not present.
|
||||
///
|
||||
/// \returns \p Default and emits error if one of the requested values cannot be
|
||||
/// converted to integer, or \p OnlyFirstRequired is false and "second" value is
|
||||
/// not present.
|
||||
std::pair<int, int> getIntegerPairAttribute(const Function &F,
|
||||
StringRef Name,
|
||||
std::pair<int, int> Default,
|
||||
bool OnlyFirstRequired = false);
|
||||
|
||||
unsigned getInitialPSInputAddr(const Function &F);
|
||||
|
||||
bool isShader(CallingConv::ID cc);
|
||||
|
@ -545,7 +545,7 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
|
||||
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
|
||||
|
||||
; HSAOPT: !0 = !{}
|
||||
; HSAOPT: !1 = !{i32 0, i32 2048}
|
||||
|
@ -47,6 +47,6 @@ define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 add
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
|
||||
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { nounwind convergent }
|
||||
|
129
test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
Normal file
129
test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
Normal file
@ -0,0 +1,129 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}min_64_max_64:
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define void @min_64_max_64() #0 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
|
||||
|
||||
; CHECK-LABEL: {{^}}min_64_max_128:
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define void @min_64_max_128() #1 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
|
||||
|
||||
; CHECK-LABEL: {{^}}min_128_max_128:
|
||||
; CHECK: SGPRBlocks: 0
|
||||
; CHECK: VGPRBlocks: 0
|
||||
; CHECK: NumSGPRsForWavesPerEU: 1
|
||||
; CHECK: NumVGPRsForWavesPerEU: 1
|
||||
define void @min_128_max_128() #2 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
|
||||
|
||||
; CHECK-LABEL: {{^}}min_1024_max_2048
|
||||
; CHECK: SGPRBlocks: 2
|
||||
; CHECK: VGPRBlocks: 7
|
||||
; CHECK: NumSGPRsForWavesPerEU: 19
|
||||
; CHECK: NumVGPRsForWavesPerEU: 32
|
||||
@var = addrspace(1) global float 0.0
|
||||
define void @min_1024_max_2048() #3 {
|
||||
%val0 = load volatile float, float addrspace(1)* @var
|
||||
%val1 = load volatile float, float addrspace(1)* @var
|
||||
%val2 = load volatile float, float addrspace(1)* @var
|
||||
%val3 = load volatile float, float addrspace(1)* @var
|
||||
%val4 = load volatile float, float addrspace(1)* @var
|
||||
%val5 = load volatile float, float addrspace(1)* @var
|
||||
%val6 = load volatile float, float addrspace(1)* @var
|
||||
%val7 = load volatile float, float addrspace(1)* @var
|
||||
%val8 = load volatile float, float addrspace(1)* @var
|
||||
%val9 = load volatile float, float addrspace(1)* @var
|
||||
%val10 = load volatile float, float addrspace(1)* @var
|
||||
%val11 = load volatile float, float addrspace(1)* @var
|
||||
%val12 = load volatile float, float addrspace(1)* @var
|
||||
%val13 = load volatile float, float addrspace(1)* @var
|
||||
%val14 = load volatile float, float addrspace(1)* @var
|
||||
%val15 = load volatile float, float addrspace(1)* @var
|
||||
%val16 = load volatile float, float addrspace(1)* @var
|
||||
%val17 = load volatile float, float addrspace(1)* @var
|
||||
%val18 = load volatile float, float addrspace(1)* @var
|
||||
%val19 = load volatile float, float addrspace(1)* @var
|
||||
%val20 = load volatile float, float addrspace(1)* @var
|
||||
%val21 = load volatile float, float addrspace(1)* @var
|
||||
%val22 = load volatile float, float addrspace(1)* @var
|
||||
%val23 = load volatile float, float addrspace(1)* @var
|
||||
%val24 = load volatile float, float addrspace(1)* @var
|
||||
%val25 = load volatile float, float addrspace(1)* @var
|
||||
%val26 = load volatile float, float addrspace(1)* @var
|
||||
%val27 = load volatile float, float addrspace(1)* @var
|
||||
%val28 = load volatile float, float addrspace(1)* @var
|
||||
%val29 = load volatile float, float addrspace(1)* @var
|
||||
%val30 = load volatile float, float addrspace(1)* @var
|
||||
%val31 = load volatile float, float addrspace(1)* @var
|
||||
%val32 = load volatile float, float addrspace(1)* @var
|
||||
%val33 = load volatile float, float addrspace(1)* @var
|
||||
%val34 = load volatile float, float addrspace(1)* @var
|
||||
%val35 = load volatile float, float addrspace(1)* @var
|
||||
%val36 = load volatile float, float addrspace(1)* @var
|
||||
%val37 = load volatile float, float addrspace(1)* @var
|
||||
%val38 = load volatile float, float addrspace(1)* @var
|
||||
%val39 = load volatile float, float addrspace(1)* @var
|
||||
%val40 = load volatile float, float addrspace(1)* @var
|
||||
|
||||
store volatile float %val0, float addrspace(1)* @var
|
||||
store volatile float %val1, float addrspace(1)* @var
|
||||
store volatile float %val2, float addrspace(1)* @var
|
||||
store volatile float %val3, float addrspace(1)* @var
|
||||
store volatile float %val4, float addrspace(1)* @var
|
||||
store volatile float %val5, float addrspace(1)* @var
|
||||
store volatile float %val6, float addrspace(1)* @var
|
||||
store volatile float %val7, float addrspace(1)* @var
|
||||
store volatile float %val8, float addrspace(1)* @var
|
||||
store volatile float %val9, float addrspace(1)* @var
|
||||
store volatile float %val10, float addrspace(1)* @var
|
||||
store volatile float %val11, float addrspace(1)* @var
|
||||
store volatile float %val12, float addrspace(1)* @var
|
||||
store volatile float %val13, float addrspace(1)* @var
|
||||
store volatile float %val14, float addrspace(1)* @var
|
||||
store volatile float %val15, float addrspace(1)* @var
|
||||
store volatile float %val16, float addrspace(1)* @var
|
||||
store volatile float %val17, float addrspace(1)* @var
|
||||
store volatile float %val18, float addrspace(1)* @var
|
||||
store volatile float %val19, float addrspace(1)* @var
|
||||
store volatile float %val20, float addrspace(1)* @var
|
||||
store volatile float %val21, float addrspace(1)* @var
|
||||
store volatile float %val22, float addrspace(1)* @var
|
||||
store volatile float %val23, float addrspace(1)* @var
|
||||
store volatile float %val24, float addrspace(1)* @var
|
||||
store volatile float %val25, float addrspace(1)* @var
|
||||
store volatile float %val26, float addrspace(1)* @var
|
||||
store volatile float %val27, float addrspace(1)* @var
|
||||
store volatile float %val28, float addrspace(1)* @var
|
||||
store volatile float %val29, float addrspace(1)* @var
|
||||
store volatile float %val30, float addrspace(1)* @var
|
||||
store volatile float %val31, float addrspace(1)* @var
|
||||
store volatile float %val32, float addrspace(1)* @var
|
||||
store volatile float %val33, float addrspace(1)* @var
|
||||
store volatile float %val34, float addrspace(1)* @var
|
||||
store volatile float %val35, float addrspace(1)* @var
|
||||
store volatile float %val36, float addrspace(1)* @var
|
||||
store volatile float %val37, float addrspace(1)* @var
|
||||
store volatile float %val38, float addrspace(1)* @var
|
||||
store volatile float %val39, float addrspace(1)* @var
|
||||
store volatile float %val40, float addrspace(1)* @var
|
||||
|
||||
ret void
|
||||
}
|
||||
attributes #3 = {"amdgpu-flat-work-group-size"="1024,2048"}
|
17
test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
Normal file
17
test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
Normal file
@ -0,0 +1,17 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}max_18_sgprs:
|
||||
; CHECK: SGPRBlocks: 1
|
||||
; CHECK: NumSGPRsForWavesPerEU: 13
|
||||
define void @max_18_sgprs(i32 addrspace(1)* %out1,
|
||||
i32 addrspace(1)* %out2,
|
||||
i32 addrspace(1)* %out3,
|
||||
i32 addrspace(1)* %out4,
|
||||
i32 %one, i32 %two, i32 %three, i32 %four) #0 {
|
||||
store i32 %one, i32 addrspace(1)* %out1
|
||||
store i32 %two, i32 addrspace(1)* %out2
|
||||
store i32 %three, i32 addrspace(1)* %out3
|
||||
store i32 %four, i32 addrspace(1)* %out4
|
||||
ret void
|
||||
}
|
||||
attributes #0 = {"amdgpu-num-sgpr"="18"}
|
75
test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
Normal file
75
test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
Normal file
@ -0,0 +1,75 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
@var = addrspace(1) global float 0.0
|
||||
|
||||
; CHECK-LABEL: {{^}}max_20_vgprs:
|
||||
; CHECK: VGPRBlocks: 4
|
||||
; CHECK: NumVGPRsForWavesPerEU: 20
|
||||
define void @max_20_vgprs() #1 {
|
||||
%val0 = load volatile float, float addrspace(1)* @var
|
||||
%val1 = load volatile float, float addrspace(1)* @var
|
||||
%val2 = load volatile float, float addrspace(1)* @var
|
||||
%val3 = load volatile float, float addrspace(1)* @var
|
||||
%val4 = load volatile float, float addrspace(1)* @var
|
||||
%val5 = load volatile float, float addrspace(1)* @var
|
||||
%val6 = load volatile float, float addrspace(1)* @var
|
||||
%val7 = load volatile float, float addrspace(1)* @var
|
||||
%val8 = load volatile float, float addrspace(1)* @var
|
||||
%val9 = load volatile float, float addrspace(1)* @var
|
||||
%val10 = load volatile float, float addrspace(1)* @var
|
||||
%val11 = load volatile float, float addrspace(1)* @var
|
||||
%val12 = load volatile float, float addrspace(1)* @var
|
||||
%val13 = load volatile float, float addrspace(1)* @var
|
||||
%val14 = load volatile float, float addrspace(1)* @var
|
||||
%val15 = load volatile float, float addrspace(1)* @var
|
||||
%val16 = load volatile float, float addrspace(1)* @var
|
||||
%val17 = load volatile float, float addrspace(1)* @var
|
||||
%val18 = load volatile float, float addrspace(1)* @var
|
||||
%val19 = load volatile float, float addrspace(1)* @var
|
||||
%val20 = load volatile float, float addrspace(1)* @var
|
||||
%val21 = load volatile float, float addrspace(1)* @var
|
||||
%val22 = load volatile float, float addrspace(1)* @var
|
||||
%val23 = load volatile float, float addrspace(1)* @var
|
||||
%val24 = load volatile float, float addrspace(1)* @var
|
||||
%val25 = load volatile float, float addrspace(1)* @var
|
||||
%val26 = load volatile float, float addrspace(1)* @var
|
||||
%val27 = load volatile float, float addrspace(1)* @var
|
||||
%val28 = load volatile float, float addrspace(1)* @var
|
||||
%val29 = load volatile float, float addrspace(1)* @var
|
||||
%val30 = load volatile float, float addrspace(1)* @var
|
||||
|
||||
store volatile float %val0, float addrspace(1)* @var
|
||||
store volatile float %val1, float addrspace(1)* @var
|
||||
store volatile float %val2, float addrspace(1)* @var
|
||||
store volatile float %val3, float addrspace(1)* @var
|
||||
store volatile float %val4, float addrspace(1)* @var
|
||||
store volatile float %val5, float addrspace(1)* @var
|
||||
store volatile float %val6, float addrspace(1)* @var
|
||||
store volatile float %val7, float addrspace(1)* @var
|
||||
store volatile float %val8, float addrspace(1)* @var
|
||||
store volatile float %val9, float addrspace(1)* @var
|
||||
store volatile float %val10, float addrspace(1)* @var
|
||||
store volatile float %val11, float addrspace(1)* @var
|
||||
store volatile float %val12, float addrspace(1)* @var
|
||||
store volatile float %val13, float addrspace(1)* @var
|
||||
store volatile float %val14, float addrspace(1)* @var
|
||||
store volatile float %val15, float addrspace(1)* @var
|
||||
store volatile float %val16, float addrspace(1)* @var
|
||||
store volatile float %val17, float addrspace(1)* @var
|
||||
store volatile float %val18, float addrspace(1)* @var
|
||||
store volatile float %val19, float addrspace(1)* @var
|
||||
store volatile float %val20, float addrspace(1)* @var
|
||||
store volatile float %val21, float addrspace(1)* @var
|
||||
store volatile float %val22, float addrspace(1)* @var
|
||||
store volatile float %val23, float addrspace(1)* @var
|
||||
store volatile float %val24, float addrspace(1)* @var
|
||||
store volatile float %val25, float addrspace(1)* @var
|
||||
store volatile float %val26, float addrspace(1)* @var
|
||||
store volatile float %val27, float addrspace(1)* @var
|
||||
store volatile float %val28, float addrspace(1)* @var
|
||||
store volatile float %val29, float addrspace(1)* @var
|
||||
store volatile float %val30, float addrspace(1)* @var
|
||||
|
||||
ret void
|
||||
}
|
||||
attributes #1 = {"amdgpu-num-vgpr"="20"}
|
57
test/CodeGen/AMDGPU/attr-unparseable.ll
Normal file
57
test/CodeGen/AMDGPU/attr-unparseable.ll
Normal file
@ -0,0 +1,57 @@
|
||||
; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s
|
||||
|
||||
; CHECK: can't parse integer attribute amdgpu-num-sgpr
|
||||
define void @unparseable_single_0() #0 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #0 = {"amdgpu-num-sgpr"}
|
||||
|
||||
; CHECK: can't parse integer attribute amdgpu-num-sgpr
|
||||
define void @unparseable_single_1() #1 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #1 = {"amdgpu-num-sgpr"="k"}
|
||||
|
||||
; CHECK: can't parse integer attribute amdgpu-num-sgpr
|
||||
define void @unparseable_single_2() #2 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #2 = {"amdgpu-num-sgpr"="1,2"}
|
||||
|
||||
; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
|
||||
define void @unparseable_pair_0() #3 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #3 = {"amdgpu-flat-work-group-size"}
|
||||
|
||||
; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
|
||||
define void @unparseable_pair_1() #4 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #4 = {"amdgpu-flat-work-group-size"="k"}
|
||||
|
||||
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
|
||||
define void @unparseable_pair_2() #5 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #5 = {"amdgpu-flat-work-group-size"="1"}
|
||||
|
||||
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
|
||||
define void @unparseable_pair_3() #6 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #6 = {"amdgpu-flat-work-group-size"="1,k"}
|
||||
|
||||
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
|
||||
define void @unparseable_pair_4() #7 {
|
||||
entry:
|
||||
ret void
|
||||
}
|
||||
attributes #7 = {"amdgpu-flat-work-group-size"="1,2,3"}
|
@ -121,4 +121,4 @@ define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <
|
||||
}
|
||||
|
||||
attributes #0 = { convergent nounwind }
|
||||
attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="2" "amdgpu-max-work-group-size"="64" }
|
||||
attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,64" }
|
||||
|
@ -255,10 +255,10 @@ entry:
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
|
||||
attributes #1 = { nounwind "amdgpu-max-waves-per-eu"="3" "amdgpu-max-work-group-size"="256" }
|
||||
attributes #2 = { nounwind "amdgpu-max-waves-per-eu"="1" "amdgpu-max-work-group-size"="1600" }
|
||||
attributes #3 = { nounwind "amdgpu-max-waves-per-eu"="0" }
|
||||
attributes #4 = { nounwind "amdgpu-max-waves-per-eu"="-1" }
|
||||
attributes #5 = { nounwind "amdgpu-max-waves-per-eu"="6" "amdgpu-max-work-group-size"="64" }
|
||||
attributes #6 = { nounwind "amdgpu-max-waves-per-eu"="8" "amdgpu-max-work-group-size"="64" }
|
||||
attributes #7 = { nounwind "amdgpu-max-waves-per-eu"="9" "amdgpu-max-work-group-size"="64" }
|
||||
attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,3" "amdgpu-flat-work-group-size"="256,256" }
|
||||
attributes #2 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1600,1600" }
|
||||
attributes #3 = { nounwind "amdgpu-waves-per-eu"="1,10" }
|
||||
attributes #4 = { nounwind "amdgpu-waves-per-eu"="1,10" }
|
||||
attributes #5 = { nounwind "amdgpu-waves-per-eu"="1,6" "amdgpu-flat-work-group-size"="64,64" }
|
||||
attributes #6 = { nounwind "amdgpu-waves-per-eu"="1,8" "amdgpu-flat-work-group-size"="64,64" }
|
||||
attributes #7 = { nounwind "amdgpu-waves-per-eu"="1,9" "amdgpu-flat-work-group-size"="64,64" }
|
||||
|
@ -1,41 +0,0 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -regalloc=basic -post-RA-scheduler=0 < %s | FileCheck %s
|
||||
|
||||
; CHECK: NumVgprs: 64
|
||||
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
|
||||
main_body:
|
||||
%8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
|
||||
%9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
|
||||
%10 = extractelement <3 x i32> %7, i32 0
|
||||
%11 = extractelement <3 x i32> %7, i32 1
|
||||
%12 = mul i32 %10, %11
|
||||
%bc = bitcast <3 x i32> %7 to <3 x float>
|
||||
%13 = extractelement <3 x float> %bc, i32 1
|
||||
%14 = insertelement <512 x float> undef, float %13, i32 %12
|
||||
call void @llvm.amdgcn.s.barrier()
|
||||
%15 = extractelement <3 x i32> %6, i32 0
|
||||
%16 = extractelement <3 x i32> %7, i32 0
|
||||
%17 = shl i32 %15, 5
|
||||
%18 = add i32 %17, %16
|
||||
%19 = shl i32 %18, 4
|
||||
%20 = extractelement <3 x i32> %7, i32 1
|
||||
%21 = shl i32 %20, 2
|
||||
%22 = sext i32 %21 to i64
|
||||
%23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
|
||||
%24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
|
||||
%25 = load i32, i32 addrspace(3)* %24, align 4
|
||||
%26 = extractelement <512 x float> %14, i32 %25
|
||||
%27 = insertelement <4 x float> undef, float %26, i32 0
|
||||
call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.amdgcn.s.barrier() #1
|
||||
|
||||
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
|
||||
|
||||
attributes #0 = { "amdgpu-max-work-group-size"="1024" }
|
||||
attributes #1 = { convergent nounwind }
|
||||
attributes #2 = { nounwind }
|
||||
|
||||
!0 = !{!1, !1, i64 0, i32 1}
|
||||
!1 = !{!"const", null}
|
@ -343,8 +343,8 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <
|
||||
; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
|
||||
; GCN-DAG: s_load_dwordx16
|
||||
; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
|
||||
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
|
||||
; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
|
||||
|
||||
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
|
||||
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
|
||||
|
@ -297,4 +297,4 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
|
||||
|
||||
; OPT: !0 = !{i32 0, i32 2048}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="2" }
|
||||
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
|
||||
|
@ -61,5 +61,5 @@ define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" "amdgpu-max-waves-per-eu"="3" }
|
||||
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,3" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
@ -34,5 +34,5 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
|
||||
attributes #1 = { nounwind optnone noinline "amdgpu-max-work-group-size"="64" }
|
||||
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" }
|
||||
attributes #1 = { nounwind optnone noinline "amdgpu-flat-work-group-size"="64,64" }
|
||||
|
@ -127,4 +127,4 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-work-group-size"="64" }
|
||||
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-waves-per-eu"="1,7" }
|
||||
|
@ -61,4 +61,4 @@ define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a
|
||||
|
||||
declare i32* @get_unknown_pointer() #0
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
|
||||
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
|
||||
|
@ -201,4 +201,4 @@ for.body: ; preds = %for.body, %for.body
|
||||
|
||||
declare i32* @get_unknown_pointer() #0
|
||||
|
||||
attributes #0 = { nounwind "amdgpu-max-waves-per-eu"="1" }
|
||||
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
|
||||
|
@ -129,5 +129,5 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { norecurse nounwind "amdgpu-max-waves-per-eu"="1" }
|
||||
attributes #0 = { norecurse nounwind "amdgpu-waves-per-eu"="1,1" }
|
||||
attributes #1 = { norecurse nounwind }
|
@ -108,5 +108,5 @@ attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { nounwind "target-cpu"="tahiti" }
|
||||
attributes #3 = { nounwind "target-cpu"="bonaire" }
|
||||
attributes #4 = { nounwind "target-cpu"="fiji" }
|
||||
attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-max-waves-per-eu"="3" }
|
||||
attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-max-waves-per-eu"="3" }
|
||||
attributes #5 = { nounwind "target-features"="+promote-alloca" "amdgpu-waves-per-eu"="1,3" }
|
||||
attributes #6 = { nounwind "target-features"="-promote-alloca" "amdgpu-waves-per-eu"="1,3" }
|
||||
|
Loading…
Reference in New Issue
Block a user