AMDGPU: Start adding tail call support

Handle the sibling call cases.

llvm-svn: 310753

parent 95f9246136, commit 09562e957d
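
For context, a sibling call is a tail call made without GuaranteedTailCallOpt: the caller stays on the normal ABI, reuses its own incoming stack argument area, and the call becomes a jump (s_setpc_b64) rather than s_swappc_b64. A minimal LLVM IR sketch of the pattern this patch handles (illustrative only; the new tests at the end of the commit exercise the same shape):

    define fastcc i32 @callee(i32 %x) {
      %y = add i32 %x, 1
      ret i32 %y
    }

    define fastcc i32 @caller(i32 %x) {
      ; Same calling convention, no byval arguments, no extra outgoing stack:
      ; eligible for the sibling-call lowering added below.
      %r = tail call fastcc i32 @callee(i32 %x)
      ret i32 %r
    }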
@@ -631,10 +631,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
       }

       if (MI.isCall()) {
-        assert(MI.getOpcode() == AMDGPU::SI_CALL);
-
         // Pseudo used just to encode the underlying global. Is there a better
         // way to track this?
-        const Function *Callee = cast<Function>(MI.getOperand(2).getGlobal());
+
+        const MachineOperand *CalleeOp
+          = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
+        const Function *Callee = cast<Function>(CalleeOp->getGlobal());
         if (Callee->isDeclaration()) {
           // If this is a call to an external function, we can't do much. Make
           // conservative guesses.
@@ -1001,6 +1001,42 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
 }

+SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
+                                                  SelectionDAG &DAG,
+                                                  MachineFrameInfo &MFI,
+                                                  int ClobberedFI) const {
+  SmallVector<SDValue, 8> ArgChains;
+  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
+  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
+
+  // Include the original chain at the beginning of the list. When this is
+  // used by target LowerCall hooks, this helps legalize find the
+  // CALLSEQ_BEGIN node.
+  ArgChains.push_back(Chain);
+
+  // Add a chain value for each stack argument corresponding
+  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
+         UE = DAG.getEntryNode().getNode()->use_end();
+       U != UE; ++U) {
+    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
+      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
+        if (FI->getIndex() < 0) {
+          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
+          int64_t InLastByte = InFirstByte;
+          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
+
+          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
+              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
+            ArgChains.push_back(SDValue(L, 1));
+        }
+      }
+    }
+  }
+
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
                                                  SmallVectorImpl<SDValue> &InVals,
                                                  StringRef Reason) const {
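
The overlap test above is an ordinary closed-interval intersection between the fixed object being clobbered and each incoming stack-argument load. A standalone restatement of the predicate (a sketch, not part of the patch):

    #include <cstdint>

    // Inclusive byte ranges [FirstA, LastA] and [FirstB, LastB] intersect
    // exactly when one range's first byte lies inside the other range.
    static bool bytesOverlap(int64_t FirstA, int64_t LastA,
                             int64_t FirstB, int64_t LastB) {
      return (FirstB <= FirstA && FirstA <= LastB) ||
             (FirstA <= FirstB && FirstB <= LastA);
    }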
@@ -3658,6 +3694,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(ELSE)
   NODE_NAME_CASE(LOOP)
   NODE_NAME_CASE(CALL)
+  NODE_NAME_CASE(TC_RETURN)
   NODE_NAME_CASE(TRAP)
   NODE_NAME_CASE(RET_FLAG)
   NODE_NAME_CASE(RETURN_TO_EPILOG)
@@ -172,6 +172,11 @@ public:
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                       SelectionDAG &DAG) const override;

+  SDValue addTokenForArgument(SDValue Chain,
+                              SelectionDAG &DAG,
+                              MachineFrameInfo &MFI,
+                              int ClobberedFI) const;
+
   SDValue lowerUnhandledCall(CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals,
                              StringRef Reason) const;
@@ -291,6 +296,7 @@ enum NodeType : unsigned {

   // Function call.
   CALL,
+  TC_RETURN,
   TRAP,

   // Masked control flow nodes.
@@ -74,6 +74,8 @@ def AMDGPUAddeSubeOp : SDTypeProfile<2, 3,
   [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisVT<0, i32>, SDTCisVT<1, i1>, SDTCisVT<4, i1>]
 >;

+def SDT_AMDGPUTCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
@@ -98,6 +100,10 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
                          SDNPVariadic]
 >;

+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", SDT_AMDGPUTCRET,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
 def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
   SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
     [SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
@@ -146,6 +146,9 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
     OutMI.addOperand(Dest);
     OutMI.addOperand(Src);
     return;
+  } else if (Opcode == AMDGPU::SI_TCRETURN) {
+    // TODO: How to use branch immediate and avoid register+add?
+    Opcode = AMDGPU::S_SETPC_B64;
   }

   int MCOpcode = TII->pseudoToMCOpcode(Opcode);
@@ -32,6 +32,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
@@ -84,6 +85,10 @@

 using namespace llvm;

+#define DEBUG_TYPE "si-lower"
+
+STATISTIC(NumTailCalls, "Number of tail calls");
+
 static cl::opt<bool> EnableVGPRIndexMode(
   "amdgpu-vgpr-index-mode",
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
@@ -1647,6 +1652,9 @@ SDValue SITargetLowering::LowerFormalArguments(
     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
   ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());

+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  Info->setBytesInStackArgArea(StackArgSize);
+
   return Chains.empty() ? Chain :
     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
@@ -1955,6 +1963,103 @@ void SITargetLowering::passSpecialInputs(
   }
 }

+static bool canGuaranteeTCO(CallingConv::ID CC) {
+  return CC == CallingConv::Fast;
+}
+
+/// Return true if we might ever do TCO for calls with this calling convention.
+static bool mayTailCallThisCC(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+    return true;
+  default:
+    return canGuaranteeTCO(CC);
+  }
+}
+
+bool SITargetLowering::isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+  if (!mayTailCallThisCC(CalleeCC))
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *CallerF = MF.getFunction();
+  CallingConv::ID CallerCC = CallerF->getCallingConv();
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
+
+  // Kernels aren't callable, and don't have a live in return address so it
+  // doesn't make sense to do a tail call with entry functions.
+  if (!CallerPreserved)
+    return false;
+
+  bool CCMatch = CallerCC == CalleeCC;
+
+  if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
+    if (canGuaranteeTCO(CalleeCC) && CCMatch)
+      return true;
+    return false;
+  }
+
+  // TODO: Can we handle var args?
+  if (IsVarArg)
+    return false;
+
+  for (const Argument &Arg : CallerF->args()) {
+    if (Arg.hasByValAttr())
+      return false;
+  }
+
+  LLVMContext &Ctx = *DAG.getContext();
+
+  // Check that the call results are passed in the same way.
+  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
+                                  CCAssignFnForCall(CalleeCC, IsVarArg),
+                                  CCAssignFnForCall(CallerCC, IsVarArg)))
+    return false;
+
+  // The callee has to preserve all registers the caller needs to preserve.
+  if (!CCMatch) {
+    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
+    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+      return false;
+  }
+
+  // Nothing more to check if the callee is taking no arguments.
+  if (Outs.empty())
+    return true;
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
+
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
+
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  // If the stack arguments for this call do not fit into our own save area then
+  // the call cannot be made tail.
+  // TODO: Is this really necessary?
+  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+    return false;
+
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
+}
+
+bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+
+  const Function *ParentFn = CI->getParent()->getParent();
+  if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
+    return false;
+
+  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
+  return (Attr.getValueAsString() != "true");
+}
+
 // The wave scratch offset register is used as the global base pointer.
 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                                     SmallVectorImpl<SDValue> &InVals) const {
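
These checks mirror the AArch64 sibling-call rules: kernels never qualify (no call-preserved mask, no live-in return address), byval arguments in the caller disqualify it, and the result-passing and callee-saved conventions must be compatible. One detail worth an example is the function attribute consulted by mayBeEmittedAsTailCall; a hedged IR sketch (function names are made up):

    define fastcc i32 @no_tco_here(i32 %x) "disable-tail-calls"="true" {
      ; mayBeEmittedAsTailCall returns false because of the attribute,
      ; so this lowers as an ordinary call despite the 'tail' marker.
      %r = tail call fastcc i32 @callee(i32 %x)
      ret i32 %r
    }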
@@ -1987,8 +2092,27 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                               "unsupported required tail call to function ");
   }

-  // TODO: Implement tail calls.
-  IsTailCall = false;
+  // The first 4 bytes are reserved for the callee's emergency stack slot.
+  const unsigned CalleeUsableStackOffset = 4;
+
+  if (IsTailCall) {
+    IsTailCall = isEligibleForTailCallOptimization(
+      Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
+    if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
+    }
+
+    bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+
+    // A sibling call is one where we're under the usual C ABI and not planning
+    // to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
+
+    if (IsTailCall)
+      ++NumTailCalls;
+  }

 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
   // FIXME: Remove this hack for function pointer types.
@@ -2020,8 +2144,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // by this amount for a tail call. In a sibling call it must be 0 because the
   // caller will deallocate the entire stack and the callee still expects its
   // arguments to begin at SP+0. Completely unused for non-tail calls.
-  int FPDiff = 0;
+  int32_t FPDiff = 0;
   MachineFrameInfo &MFI = MF.getFrameInfo();
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

   // Adjust the stack pointer for the new arguments...
@@ -2044,9 +2168,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,

   // Stack pointer relative accesses are done by changing the offset SGPR. This
   // is just the VGPR offset component.
-
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  SDValue StackPtr = DAG.getConstant(4, DL, MVT::i32);
+  SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);

   SmallVector<SDValue, 8> MemOpChains;
   MVT PtrVT = MVT::i32;
@@ -2093,10 +2215,28 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);

-      if (!IsTailCall) {
-        SDValue PtrOff = DAG.getTargetConstant(Offset, DL, MVT::i32);
+      if (IsTailCall) {
+        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        unsigned OpSize = Flags.isByVal() ?
+          Flags.getByValSize() : VA.getValVT().getStoreSize();

-        DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+        Offset = Offset + FPDiff;
+        int FI = MFI.CreateFixedObject(OpSize, Offset, true);
+
+        DstAddr = DAG.getFrameIndex(FI, PtrVT);
+        DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
+        DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
+
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+
+        // FIXME: Why is this really necessary? This seems to just result in a
+        // lot of code to copy the stack and write them back to the same
+        // locations, which are supposed to be immutable?
+        Chain = addTokenForArgument(Chain, DAG, MFI, FI);
+      } else {
+        DstAddr = PtrOff;
         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
       }
@@ -2132,6 +2272,22 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     InFlag = Chain.getValue(1);
   }

+  SDValue PhysReturnAddrReg;
+  if (IsTailCall) {
+    // Since the return is being combined with the call, we need to pass on the
+    // return address.
+
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+                                        MVT::i64);
+    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
   // We don't usually want to end the call-sequence here because we would tidy
   // the frame up *after* the call, however in the ABI-changing tail-call case
   // we've carefully laid out the parameters so that when sp is reset they'll be
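
Since the callee now returns directly to this function's caller, the incoming return address must survive the jump in the physical return-address register. The resulting difference in emitted code, sketched from the CHECK lines of the new sibling-call test (illustrative, not verbatim compiler output):

    ; Normal call:                          ; Sibling call:
    ;   s_getpc_b64 s[6:7]                  ;   s_getpc_b64 s[6:7]
    ;   s_add_u32  s6, s6, fn@rel32@lo+4    ;   s_add_u32  s6, s6, fn@rel32@lo+4
    ;   s_addc_u32 s7, s7, fn@rel32@hi+4    ;   s_addc_u32 s7, s7, fn@rel32@hi+4
    ;   s_swappc_b64 s[30:31], s[6:7]       ;   s_setpc_b64 s[6:7]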
@@ -2153,6 +2309,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
     // this information must travel along with the operation for eventual
     // consumption by emitEpilogue.
     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
+
+    Ops.push_back(PhysReturnAddrReg);
   }

   // Add argument registers to the end of the list so that they are known live
@@ -2177,8 +2335,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   // If we're doing a tail call, use a TC_RETURN here rather than an
   // actual call instruction.
   if (IsTailCall) {
-    MF.getFrameInfo().setHasTailCall();
-    llvm_unreachable("not implemented");
+    MFI.setHasTailCall();
+    return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
   }

   // Returns a chain and a flag for retval copy to use.
@@ -2873,7 +3031,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
       .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
     return BB;
   }
-  case AMDGPU::SI_CALL_ISEL: {
+  case AMDGPU::SI_CALL_ISEL:
+  case AMDGPU::SI_TCRETURN_ISEL: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     const DebugLoc &DL = MI.getDebugLoc();
     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
@@ -2885,17 +3044,24 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(

     const GlobalValue *G = PCRel->getOperand(1).getGlobal();

-    MachineInstrBuilder MIB =
-      BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
-      .add(MI.getOperand(0))
-      .addGlobalAddress(G);
+    MachineInstrBuilder MIB;
+    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
+      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
+        .add(MI.getOperand(0))
+        .addGlobalAddress(G);
+    } else {
+      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
+        .add(MI.getOperand(0))
+        .addGlobalAddress(G);
+
+      // There is an additional imm operand for tcreturn, but it should be in the
+      // right place already.
+    }

     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
       MIB.add(MI.getOperand(I));

-
     MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

     MI.eraseFromParent();
     return BB;
   }
@@ -224,6 +224,15 @@ public:
                     const SDLoc &DL, SelectionDAG &DAG,
                     SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                     SDValue ThisVal) const;

+  bool mayBeEmittedAsTailCall(const CallInst *) const override;
+
+  bool isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
+
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
@@ -378,6 +378,31 @@ def SI_CALL : SPseudoInstSI <
   (outs SReg_64:$dst), (ins SSrc_b64:$src0, unknown:$callee)> {
   let Size = 4;
   let isCall = 1;
   let UseNamedOperandTable = 1;
   let SchedRW = [WriteBranch];
 }

+// Tail call handling pseudo
+def SI_TCRETURN_ISEL : SPseudoInstSI<(outs),
+  (ins SSrc_b64:$src0, i32imm:$fpdiff),
+  [(AMDGPUtc_return i64:$src0, i32:$fpdiff)]> {
+  let isCall = 1;
+  let isTerminator = 1;
+  let isReturn = 1;
+  let isBarrier = 1;
+  let SchedRW = [WriteBranch];
+  let usesCustomInserter = 1;
+}
+
+def SI_TCRETURN : SPseudoInstSI <
+  (outs),
+  (ins SSrc_b64:$src0, unknown:$callee, i32imm:$fpdiff)> {
+  let Size = 4;
+  let isCall = 1;
+  let isTerminator = 1;
+  let isReturn = 1;
+  let isBarrier = 1;
+  let UseNamedOperandTable = 1;
+  let SchedRW = [WriteBranch];
+}
+
@@ -110,6 +110,17 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   unsigned PSInputAddr = 0;
   unsigned PSInputEnable = 0;

+  /// Number of bytes of arguments this function has on the stack. If the callee
+  /// is expected to restore the argument stack this should be a multiple of 16,
+  /// all usable during a tail call.
+  ///
+  /// The alternative would forbid tail call optimisation in some cases: if we
+  /// want to transfer control from a function with 8-bytes of stack-argument
+  /// space to a function with 16-bytes then misalignment of this value would
+  /// make a stack adjustment necessary, which could not be undone by the
+  /// callee.
+  unsigned BytesInStackArgArea = 0;
+
   bool ReturnsVoid = true;

   // A pair of default/requested minimum/maximum flat work group sizes.
@@ -235,6 +246,14 @@ public:
   unsigned getTIDReg() const { return TIDReg; }
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }

+  unsigned getBytesInStackArgArea() const {
+    return BytesInStackArgArea;
+  }
+
+  void setBytesInStackArgArea(unsigned Bytes) {
+    BytesInStackArgArea = Bytes;
+  }
+
   // Add user SGPRs.
   unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
   unsigned addDispatchPtr(const SIRegisterInfo &TRI);
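
A worked example of how this field feeds the eligibility check in isEligibleForTailCallOptimization (the numbers are illustrative only):

    // Caller receives 8 bytes of stack arguments: BytesInStackArgArea = 8.
    //   Callee needing getNextStackOffset() == 8  -> eligible; its stack
    //   arguments overwrite the caller's own incoming argument area.
    //   Callee needing getNextStackOffset() == 12 -> rejected (see the
    //   no_sibling_call_callee_more_stack_space test below), because the
    //   extra stack adjustment could not be undone by the callee.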
test/CodeGen/AMDGPU/sibling-call.ll (new file, 225 lines)
@@ -0,0 +1,225 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s

; GCN-LABEL: {{^}}i32_fastcc_i32_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
  %add0 = add i32 %arg0, %arg1
  ret i32 %add0
}

; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32:
define fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_stack_object:
; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:24
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4
  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
  store volatile i32 9, i32* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret i32 %ret
}

; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_unused_result:
define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; It doesn't make sense to do a tail call from a kernel
; GCN-LABEL: {{^}}kernel_call_i32_fastcc_i32_i32_unused_result:
;define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  ret void
}

; GCN-LABEL: {{^}}i32_fastcc_i32_byval_i32:
; GCN: s_waitcnt
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
define fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, i32* byval align 4 %arg1) #1 {
  %arg1.load = load i32, i32* %arg1, align 4
  %add0 = add i32 %arg0, %arg1.load
  ret i32 %add0
}

; Tail call disallowed with byval in parent.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4
; GCN: s_swappc_b64
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, i32* byval %b.byval, i32 %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* %b.byval)
  ret i32 %ret
}

; Tail call disallowed with byval in parent, not callee.
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32:
; GCN-NOT: v0
; GCN-NOT: s32
; GCN: buffer_load_dword v1, off, s[0:3], s4 offset:16
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v1, off, s[0:3], s5 offset:4
; GCN-NEXT: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [16 x i32] %large) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, i32* inttoptr (i32 16 to i32*))
  ret i32 %ret
}

; GCN-LABEL: {{^}}i32_fastcc_i32_i32_a32i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8
; GCN-DAG: v_add_i32_e32 v0, vcc, v1, v0
; GCN: v_add_i32_e32 v0, vcc, [[LOAD_0]], v0
; GCN: v_add_i32_e32 v0, vcc, [[LOAD_1]], v0
; GCN-NEXT: s_setpc_b64
define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %large) #1 {
  %val_firststack = extractvalue [32 x i32] %large, 30
  %val_laststack = extractvalue [32 x i32] %large, 31
  %add0 = add i32 %arg0, %arg1
  %add1 = add i32 %add0, %val_firststack
  %add2 = add i32 %add1, %val_laststack
  ret i32 %add2
}

; FIXME: Why load and store same location for stack args?
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
; GCN: s_mov_b32 s5, s32

; GCN-DAG: buffer_store_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill

; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s5 offset:8

; GCN-NOT: s32

; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s5 offset:8

; GCN-DAG: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:44

; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4
  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
  store volatile i32 9, i32* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; If the callee requires more stack argument space than the caller,
; don't do a tail call.
; TODO: Do we really need this restriction?

; GCN-LABEL: {{^}}no_sibling_call_callee_more_stack_space:
; GCN: s_swappc_b64
; GCN: s_setpc_b64
define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
entry:
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

; Have another non-tail in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v34, s33, 0
; GCN-DAG: v_writelane_b32 v34, s34, 1
; GCN-DAG: v_writelane_b32 v34, s35, 2
; GCN-DAG: s_add_u32 s32, s32, 0x400

; GCN: s_getpc_b64
; GCN: s_swappc_b64

; GCN: s_getpc_b64 s[6:7]
; GCN: s_add_u32 s6, s6, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
; GCN: s_addc_u32 s7, s7, sibling_call_i32_fastcc_i32_i32@rel32@hi+4

; GCN-DAG: v_readlane_b32 s33, v34, 0
; GCN-DAG: v_readlane_b32 s34, v34, 1
; GCN-DAG: v_readlane_b32 s35, v34, 2

; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
entry:
  %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
  %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
  ret i32 %ret
}

; Have stack object in caller and stack passed arguments. SP should be
; in same place at function exit.

; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
; GCN: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4
  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
  store volatile i32 9, i32* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
  ret i32 %ret
}

; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
; GCN: s_mov_b32 s5, s32
; GCN-NOT: s32
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area(i32 %a, i32 %b, [36 x i32] %c) #1 {
entry:
  %alloca = alloca [16 x i32], align 4
  %gep = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 5
  store volatile i32 9, i32* %gep
  %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
  ret i32 %ret
}

attributes #0 = { nounwind }
attributes #1 = { nounwind noinline }
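
To reproduce these checks outside of lit, the file's own RUN line can be invoked directly, e.g. (path assumed):

    llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global \
        -amdgpu-sroa=0 -verify-machineinstrs < test/CodeGen/AMDGPU/sibling-call.ll \
      | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA test/CodeGen/AMDGPU/sibling-call.ll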
test/CodeGen/AMDGPU/tail-call-cgp.ll (new file, 43 lines)
@@ -0,0 +1,43 @@
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -codegenprepare %s | FileCheck %s

define internal fastcc void @callee(i32* nocapture %p, i32 %a) #0 {
  store volatile i32 %a, i32* %p, align 4
  ret void
}

; CHECK-LABEL: @func_caller(
; CHECK: tail call fastcc void @callee(
; CHECK-NEXT: ret void
; CHECK: ret void
define void @func_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
entry:
  %cmp = icmp eq i32 %b, 0
  br i1 %cmp, label %bb, label %ret

bb:
  tail call fastcc void @callee(i32* %p, i32 %a)
  br label %ret

ret:
  ret void
}

; CHECK-LABEL: @kernel_caller(
; CHECK: tail call fastcc void @callee(
; CHECK-NEXT: br label %ret

; CHECK: ret void
define amdgpu_kernel void @kernel_caller(i32* nocapture %p, i32 %a, i32 %b) #0 {
entry:
  %cmp = icmp eq i32 %b, 0
  br i1 %cmp, label %bb, label %ret

bb:
  tail call fastcc void @callee(i32* %p, i32 %a)
  br label %ret

ret:
  ret void
}

attributes #0 = { nounwind }
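
This test pins down the CodeGenPrepare side of mayBeEmittedAsTailCall: in an ordinary function the return may be duplicated into the calling block (tail call followed immediately by ret void), while in a kernel the branch to the return block must remain. Runnable directly via the RUN line, e.g. (path assumed):

    opt -S -mtriple=amdgcn-amd-amdhsa -codegenprepare test/CodeGen/AMDGPU/tail-call-cgp.ll \
      | FileCheck test/CodeGen/AMDGPU/tail-call-cgp.ll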