AMDGPU: Start defining a calling convention
Partially implement the callee side for arguments and return values. byval doesn't work properly yet, and sret or other on-stack return values most likely don't either.

llvm-svn: 303308
parent 374681b328
commit ab4fb8ba2f
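In practical terms, the patch lets a plain C-calling-convention callee be compiled for amdgcn: scalar arguments arrive in VGPR0-VGPR31 per CC_AMDGPU_Func, overflow arguments come in on the stack, and scalar results go back in VGPRs per RetCC_AMDGPU_Func. A minimal sketch of the kind of IR this enables (the function and value names are illustrative, not from this commit's tests; calls into such functions are still unimplemented at this point):

; llc -march=amdgcn -verify-machineinstrs
; %x and %y are assigned to VGPR0 and VGPR1 by CC_AMDGPU_Func; the i32
; result is returned in VGPR0 by RetCC_AMDGPU_Func via the new RET_FLAG.
define i32 @callee_add(i32 %x, i32 %y) {
  %sum = add i32 %x, %y
  ret i32 %sum
}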
@@ -38,7 +38,8 @@ class AMDGPUCallLowering: public CallLowering {
                    unsigned VReg) const override;
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
-  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
 };
 } // End of namespace llvm;
 #endif
@@ -13,6 +13,8 @@
 
 // Inversion of CCIfInReg
 class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
+class CCIfExtend<CCAction A>
+  : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
 
 // Calling convention for SI
 def CC_SI : CallingConv<[
@@ -52,7 +54,7 @@ def CC_SI : CallingConv<[
   ]>>>
 ]>;
 
-def RetCC_SI : CallingConv<[
+def RetCC_SI_Shader : CallingConv<[
   CCIfType<[i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@@ -99,6 +101,52 @@ def CC_AMDGPU_Kernel : CallingConv<[
   CCCustom<"allocateKernArg">
 ]>;
 
+def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 24, 255)
+>;
+
+def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
+  (sequence "VGPR%u", 32, 255)
+>;
+
+def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
+  (sequence "SGPR%u", 32, 103)
+>;
+
+def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
+  (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
+>;
+
+// Calling convention for leaf functions
+def CC_AMDGPU_Func : CallingConv<[
+  CCIfByVal<CCPassByVal<4, 4>>,
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+  CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
+  CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
+  CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
+  CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
+  CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
+]>;
+
+// Calling convention for leaf functions
+def RetCC_AMDGPU_Func : CallingConv<[
+  CCIfType<[i1], CCPromoteToType<i32>>,
+  CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
+  CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+]>;
+
 def CC_AMDGPU : CallingConv<[
   CCIf<"static_cast<const AMDGPUSubtarget&>"
        "(State.getMachineFunction().getSubtarget()).getGeneration() >="
@@ -76,6 +76,45 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   }
 }
 
+// Allocate up to VGPR31.
+//
+// TODO: Since there are no VGPR alignment requirements would it be better to
+// split into individual scalar registers?
+static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+  case MVT::f64:
+  case MVT::v2i32:
+  case MVT::v2f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_64RegClass, 31);
+  }
+  case MVT::v4i32:
+  case MVT::v4f32:
+  case MVT::v2i64:
+  case MVT::v2f64: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_128RegClass, 29);
+  }
+  case MVT::v8i32:
+  case MVT::v8f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_256RegClass, 25);
+  }
+  case MVT::v16i32:
+  case MVT::v16f32: {
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::VReg_512RegClass, 17);
+  }
+  default:
+    return false;
+  }
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 // Find a larger type to do a load / store of a vector with.
@@ -773,8 +812,43 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
 //===---------------------------------------------------------------------===//
 
 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
-                                                  bool IsVarArg) const {
-  return CC_AMDGPU;
+                                                  bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_HS:
+    return CC_AMDGPU;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return CC_AMDGPU_Func;
+  default:
+    report_fatal_error("Unsupported calling convention.");
+  }
+}
+
+CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
+                                                    bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_HS:
+    return RetCC_SI_Shader;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return RetCC_AMDGPU_Func;
+  default:
+    report_fatal_error("Unsupported calling convention.");
+  }
 }
 
 /// The SelectionDAGBuilder will automatically promote function arguments
@@ -874,18 +948,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
   }
 }
 
-void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
-                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
-
-  State.AnalyzeReturn(Outs, RetCC_SI);
-}
-
-SDValue
-AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
-                                  bool isVarArg,
-                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                  const SmallVectorImpl<SDValue> &OutVals,
-                                  const SDLoc &DL, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerReturn(
+  SDValue Chain, CallingConv::ID CallConv,
+  bool isVarArg,
+  const SmallVectorImpl<ISD::OutputArg> &Outs,
+  const SmallVectorImpl<SDValue> &OutVals,
+  const SDLoc &DL, SelectionDAG &DAG) const {
+  // FIXME: Fails for r600 tests
+  //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
+  //       "wave terminate should not have return values");
   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
 }
 
@@ -896,20 +967,12 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 /// Selects the correct CCAssignFn for a given CallingConvention value.
 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                     bool IsVarArg) {
-  switch (CC) {
-  case CallingConv::C:
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-    return CC_AMDGPU_Kernel;
-  case CallingConv::AMDGPU_VS:
-  case CallingConv::AMDGPU_HS:
-  case CallingConv::AMDGPU_GS:
-  case CallingConv::AMDGPU_PS:
-  case CallingConv::AMDGPU_CS:
-    return CC_AMDGPU;
-  default:
-    report_fatal_error("Unsupported calling convention.");
-  }
+  return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
+}
+
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
+                                                      bool IsVarArg) {
+  return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
 }
 
 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
@@ -115,9 +115,6 @@ protected:
                                        SmallVectorImpl<SDValue> &Results) const;
   void analyzeFormalArgumentsCompute(CCState &State,
                                      const SmallVectorImpl<ISD::InputArg> &Ins) const;
-  void AnalyzeReturn(CCState &State,
-                     const SmallVectorImpl<ISD::OutputArg> &Outs) const;
-
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
 
@@ -164,6 +161,8 @@ public:
   bool isCheapToSpeculateCtlz() const override;
 
+  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
+  static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
+
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
@@ -380,6 +380,6 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
 def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
                                     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 
-def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
   [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
 >;
@@ -126,9 +126,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
 }
 
 void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
+  unsigned Opcode = MI->getOpcode();
+
-  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
+  // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
+  // need to select it to the subtarget specific version, and there's no way to
+  // do that with a single pseudo source operation.
+  if (Opcode == AMDGPU::S_SETPC_B64_return)
+    Opcode = AMDGPU::S_SETPC_B64;
+
+  int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
   if (MCOpcode == -1) {
     LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
     C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
@@ -12,21 +12,6 @@
 
 using namespace llvm;
 
-static bool isEntryFunctionCC(CallingConv::ID CC) {
-  switch (CC) {
-  case CallingConv::AMDGPU_KERNEL:
-  case CallingConv::SPIR_KERNEL:
-  case CallingConv::AMDGPU_VS:
-  case CallingConv::AMDGPU_HS:
-  case CallingConv::AMDGPU_GS:
-  case CallingConv::AMDGPU_PS:
-  case CallingConv::AMDGPU_CS:
-    return true;
-  default:
-    return false;
-  }
-}
-
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MachineFunctionInfo(),
   LocalMemoryObjects(),
@@ -34,7 +19,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MaxKernArgAlign(0),
   LDSSize(0),
   ABIArgOffset(0),
-  IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+  IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
   NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
   // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
   // except reserved size is not correctly aligned.
@@ -14,6 +14,7 @@
 
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "SIRegisterInfo.h"
 
 using namespace llvm;
 
@@ -24,18 +25,6 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
 // they are not supported at this time.
 //===----------------------------------------------------------------------===//
 
-// Dummy to not crash RegisterClassInfo.
-static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
-
-const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
-  const MachineFunction *) const {
-  return &CalleeSavedReg;
-}
-
-unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  return AMDGPU::NoRegister;
-}
-
 unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
   static const unsigned SubRegs[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
@@ -50,3 +39,35 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
 
 #define GET_REGINFO_TARGET_DESC
 #include "AMDGPUGenRegisterInfo.inc"
+
+
+// Forced to be here by one .inc
+const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
+  const MachineFunction *MF) const {
+  CallingConv::ID CC = MF->getFunction()->getCallingConv();
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return CSR_AMDGPU_HighRegs_SaveList;
+  default: {
+    // Dummy to not crash RegisterClassInfo.
+    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
+    return &NoCalleeSavedReg;
+  }
+  }
+}
+
+const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+                                                     CallingConv::ID CC) const {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return CSR_AMDGPU_HighRegs_RegMask;
+  default:
+    return nullptr;
+  }
+}
+
+unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return AMDGPU::NoRegister;
+}
@@ -30,9 +30,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   /// \returns the sub reg enum value for the given \p Channel
   /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
   unsigned getSubRegFromChannel(unsigned Channel) const;
-
-  const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
-  unsigned getFrameRegister(const MachineFunction &MF) const override;
 };
 
 } // End namespace llvm
@@ -56,6 +56,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   return Reserved;
 }
 
+// Dummy to not crash RegisterClassInfo.
+static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
+
+const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
+  const MachineFunction *) const {
+  return &CalleeSavedReg;
+}
+
+unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return AMDGPU::NoRegister;
+}
+
 unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
   return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
 }
@@ -27,6 +27,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
   R600RegisterInfo();
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   /// \brief get the HW encoding for a register's channel.
   unsigned getHWRegChan(unsigned reg) const;
@@ -189,8 +189,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
   // ----
   // 13 (+1)
   unsigned ReservedRegCount = 13;
-  if (SPReg != AMDGPU::NoRegister)
-    ++ReservedRegCount;
 
   if (AllSGPRs.size() < ReservedRegCount)
     return std::make_pair(ScratchWaveOffsetReg, SPReg);
@@ -208,13 +206,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
       MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
       MFI->setScratchWaveOffsetReg(Reg);
       ScratchWaveOffsetReg = Reg;
-    } else {
-      if (SPReg == AMDGPU::NoRegister)
-        break;
-
-      MRI.replaceRegWith(SPReg, Reg);
-      MFI->setStackPtrOffsetReg(Reg);
-      SPReg = Reg;
       break;
     }
   }
@@ -223,8 +214,8 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
   return std::make_pair(ScratchWaveOffsetReg, SPReg);
 }
 
-void SIFrameLowering::emitPrologue(MachineFunction &MF,
-                                   MachineBasicBlock &MBB) const {
+void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
+                                                MachineBasicBlock &MBB) const {
   // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
   // specified.
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -424,6 +415,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   }
 }
 
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  if (MFI->isEntryFunction())
+    emitEntryFunctionPrologue(MF, MBB);
+}
+
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
 
@@ -26,6 +26,8 @@ public:
     AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
   ~SIFrameLowering() override = default;
 
+  void emitEntryFunctionPrologue(MachineFunction &MF,
+                                 MachineBasicBlock &MBB) const;
   void emitPrologue(MachineFunction &MF,
                     MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF,
@@ -914,6 +914,55 @@ SDValue SITargetLowering::lowerKernargMemParameter(
   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
 }
 
+SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+                                              const SDLoc &SL, SDValue Chain,
+                                              const ISD::InputArg &Arg) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (Arg.Flags.isByVal()) {
+    unsigned Size = Arg.Flags.getByValSize();
+    int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
+    return DAG.getFrameIndex(FrameIdx, MVT::i32);
+  }
+
+  unsigned ArgOffset = VA.getLocMemOffset();
+  unsigned ArgSize = VA.getValVT().getStoreSize();
+
+  int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
+
+  // Create load nodes to retrieve arguments from the stack.
+  SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+  SDValue ArgValue;
+
+  // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
+  ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+  MVT MemVT = VA.getValVT();
+
+  switch (VA.getLocInfo()) {
+  default:
+    break;
+  case CCValAssign::BCvt:
+    MemVT = VA.getLocVT();
+    break;
+  case CCValAssign::SExt:
+    ExtType = ISD::SEXTLOAD;
+    break;
+  case CCValAssign::ZExt:
+    ExtType = ISD::ZEXTLOAD;
+    break;
+  case CCValAssign::AExt:
+    ExtType = ISD::EXTLOAD;
+    break;
+  }
+
+  ArgValue = DAG.getExtLoad(
+    ExtType, SL, VA.getLocVT(), Chain, FIN,
+    MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
+    MemVT);
+  return ArgValue;
+}
+
 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
                                    CallingConv::ID CallConv,
                                    ArrayRef<ISD::InputArg> Ins,
@@ -1094,10 +1143,12 @@ static void allocateSystemSGPRs(CCState &CCInfo,
 static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                      MachineFunction &MF,
                                      const SIRegisterInfo &TRI,
-                                     SIMachineFunctionInfo &Info) {
+                                     SIMachineFunctionInfo &Info,
+                                     bool NeedSP) {
   // Now that we've figured out where the scratch register inputs are, see if
   // should reserve the arguments and use them directly.
-  bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  bool HasStackObjects = MFI.hasStackObjects();
 
   // Record that we know we have non-spill stack objects so we don't need to
   // check all stack objects later.
@@ -1155,6 +1206,15 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
     }
   }
+
+  if (NeedSP){
+    unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
+    Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
+
+    assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg());
+    assert(!TRI.isSubRegister(Info.getScratchRSrcReg(),
+                              Info.getStackPtrOffsetReg()));
+  }
 }
 
 SDValue SITargetLowering::LowerFormalArguments(
@@ -1223,8 +1283,10 @@ SDValue SITargetLowering::LowerFormalArguments(
            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
            !Info->hasWorkItemIDZ());
+  } else if (IsKernel) {
+    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
   } else {
-    assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
     Splits.append(Ins.begin(), Ins.end());
   }
 
   if (IsEntryFunc) {
@@ -1278,11 +1340,14 @@ SDValue SITargetLowering::LowerFormalArguments(
 
       InVals.push_back(Arg);
       continue;
+    } else if (!IsEntryFunc && VA.isMemLoc()) {
+      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
+      InVals.push_back(Val);
+      if (!Arg.Flags.isByVal())
+        Chains.push_back(Val.getValue(1));
+      continue;
     }
 
-    if (VA.isMemLoc())
-      report_fatal_error("memloc not supported with calling convention");
-
     assert(VA.isRegLoc() && "Parameter must be in a register!");
 
     unsigned Reg = VA.getLocReg();
@@ -1291,7 +1356,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     Reg = MF.addLiveIn(Reg, RC);
     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
 
-    if (Arg.VT.isVector()) {
+    if (IsShader && Arg.VT.isVector()) {
       // Build a vector from the registers
       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
       unsigned NumElements = ParamType->getVectorNumElements();
@@ -1317,16 +1382,49 @@ SDValue SITargetLowering::LowerFormalArguments(
     InVals.push_back(Val);
   }
 
-  // Start adding system SGPRs.
-  if (IsEntryFunc)
-    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
 
-  reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+  // TODO: Could maybe omit SP if only tail calls?
+  bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects();
+
+  // Start adding system SGPRs.
+  if (IsEntryFunc) {
+    allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
+    reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
+  } else {
+    CCInfo.AllocateReg(Info->getScratchRSrcReg());
+    CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+    CCInfo.AllocateReg(Info->getFrameOffsetReg());
+
+    if (NeedSP) {
+      unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
+      CCInfo.AllocateReg(StackPtrReg);
+      Info->setStackPtrOffsetReg(StackPtrReg);
+    }
+  }
 
   return Chains.empty() ? Chain :
     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
+// TODO: If return values can't fit in registers, we should return as many as
+// possible in registers before passing on stack.
+bool SITargetLowering::CanLowerReturn(
+  CallingConv::ID CallConv,
+  MachineFunction &MF, bool IsVarArg,
+  const SmallVectorImpl<ISD::OutputArg> &Outs,
+  LLVMContext &Context) const {
+  // Replacing returns with sret/stack usage doesn't make sense for shaders.
+  // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
+  // for shaders. Vector types should be explicitly handled by CC.
+  if (AMDGPU::isEntryFunctionCC(CallConv))
+    return true;
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+}
+
 SDValue
 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
@@ -1336,11 +1434,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
 
-  if (!AMDGPU::isShader(CallConv))
+  if (AMDGPU::isKernel(CallConv)) {
     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                              OutVals, DL, DAG);
+  }
+
+  bool IsShader = AMDGPU::isShader(CallConv);
 
   Info->setIfReturnsVoid(Outs.size() == 0);
+  bool IsWaveEnd = Info->returnsVoid() && IsShader;
 
   SmallVector<ISD::OutputArg, 48> Splits;
   SmallVector<SDValue, 48> SplitVals;
@@ -1349,7 +1451,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
     const ISD::OutputArg &Out = Outs[i];
 
-    if (Out.VT.isVector()) {
+    if (IsShader && Out.VT.isVector()) {
       MVT VT = Out.VT.getVectorElementType();
       ISD::OutputArg NewOut = Out;
       NewOut.Flags.setSplit();
@@ -1380,29 +1482,58 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                  *DAG.getContext());
 
   // Analyze outgoing return values.
-  AnalyzeReturn(CCInfo, Splits);
+  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
 
   SDValue Flag;
   SmallVector<SDValue, 48> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
 
+  // Add return address for callable functions.
+  if (!Info->isEntryFunction()) {
+    const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+    SDValue ReturnAddrReg = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
+
+    // FIXME: Should be able to use a vreg here, but need a way to prevent it
+    // from being allocated to a CSR.
+
+    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
+                                                MVT::i64);
+
+    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+    Flag = Chain.getValue(1);
+
+    RetOps.push_back(PhysReturnAddrReg);
+  }
+
   // Copy the result values into the output registers.
   for (unsigned i = 0, realRVLocIdx = 0;
        i != RVLocs.size();
        ++i, ++realRVLocIdx) {
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
+    // TODO: Partially return in registers if return values don't fit.
 
     SDValue Arg = SplitVals[realRVLocIdx];
 
+    // Copied from other backends.
     switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full:
       break;
     case CCValAssign::BCvt:
       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
       break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    default:
+      llvm_unreachable("Unknown loc info!");
     }
 
     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
@@ -1410,12 +1541,16 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
+  // FIXME: Does sret work properly?
+
   // Update chain and glue.
   RetOps[0] = Chain;
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
+  unsigned Opc = AMDGPUISD::ENDPGM;
+  if (!IsWaveEnd)
+    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
@@ -28,6 +28,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                                    uint64_t Offset, bool Signed,
                                    const ISD::InputArg *Arg = nullptr) const;
 
+  SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
+                              const SDLoc &SL, SDValue Chain,
+                              const ISD::InputArg &Arg) const;
+
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const override;
   SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -177,7 +181,12 @@ public:
                                const SDLoc &DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
-  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+  bool CanLowerReturn(CallingConv::ID CallConv,
+                      MachineFunction &MF, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                       SelectionDAG &DAG) const override;
@@ -80,17 +80,22 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
   WavesPerEU = ST.getWavesPerEU(*F);
 
-  // Non-entry functions have no special inputs for now.
-  // TODO: Return early for non-entry CCs.
+  if (!isEntryFunction()) {
+    // Non-entry functions have no special inputs for now, other registers
+    // required for scratch access.
+    ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+    ScratchWaveOffsetReg = AMDGPU::SGPR4;
+    FrameOffsetReg = AMDGPU::SGPR5;
+    return;
+  }
+
   CallingConv::ID CC = F->getCallingConv();
-  if (CC == CallingConv::AMDGPU_PS)
-    PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
-
-  if (AMDGPU::isKernel(CC)) {
+  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
     KernargSegmentPtr = true;
     WorkGroupIDX = true;
     WorkItemIDX = true;
+  } else if (CC == CallingConv::AMDGPU_PS) {
+    PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+  }
 
   if (ST.debuggerEmitPrologue()) {
@@ -120,7 +125,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
 
   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   bool MaySpill = ST.isVGPRSpillingEnabled(*F);
-  bool HasStackObjects = FrameInfo.hasStackObjects();
+  bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls();
 
   if (HasStackObjects || MaySpill) {
     PrivateSegmentWaveByteOffset = true;
@@ -388,9 +388,8 @@ public:
   void setScratchWaveOffsetReg(unsigned Reg) {
     assert(Reg != AMDGPU::NoRegister && "Should never be unset");
     ScratchWaveOffsetReg = Reg;
-
-    // FIXME: Only for entry functions.
-    FrameOffsetReg = ScratchWaveOffsetReg;
+    if (isEntryFunction())
+      FrameOffsetReg = ScratchWaveOffsetReg;
   }
 
   unsigned getQueuePtrUserSGPR() const {
@@ -117,11 +117,7 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
 }
 
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
-  const MachineFunction &MF) const {
-
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  unsigned RegCount = ST.getMaxNumSGPRs(MF);
+static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
   unsigned Reg;
 
   // Try to place it in a hole after PrivateSegmentBufferReg.
@@ -134,9 +130,22 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
     // wave offset before it.
     Reg = RegCount - 5;
   }
+
+  return Reg;
+}
+
+unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+  const MachineFunction &MF) const {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
   return AMDGPU::SGPR_32RegClass.getRegister(Reg);
 }
 
+unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
+  const MachineFunction &MF) const {
+  return AMDGPU::SGPR32;
+}
+
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@@ -198,15 +207,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
   }
 
+  unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+  if (StackPtrReg != AMDGPU::NoRegister) {
+    reserveRegisterTuples(Reserved, StackPtrReg);
+    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
+  }
+
+  unsigned FrameReg = MFI->getFrameOffsetReg();
+  if (FrameReg != AMDGPU::NoRegister) {
+    reserveRegisterTuples(Reserved, FrameReg);
+    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
+  }
+
   return Reserved;
 }
 
 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
-  return Fn.getFrameInfo().hasStackObjects();
+  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
+  if (Info->isEntryFunction()) {
+    const MachineFrameInfo &MFI = Fn.getFrameInfo();
+    return MFI.hasStackObjects() || MFI.hasCalls();
+  }
+
+  // May need scavenger for dealing with callee saved registers.
+  return true;
 }
 
-bool
-SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
+bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
   return MF.getFrameInfo().hasStackObjects();
 }
 
@@ -17,6 +17,7 @@
 
 #include "AMDGPURegisterInfo.h"
 #include "SIDefines.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
 namespace llvm {
@@ -57,8 +58,16 @@ public:
   unsigned reservedPrivateSegmentWaveByteOffsetReg(
     const MachineFunction &MF) const;
 
+  unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
+
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+                                       CallingConv::ID) const override;
+
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
+
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
 
   bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@@ -228,6 +237,11 @@ public:
 
   const int *getRegUnitPressureSets(unsigned RegUnit) const override;
 
+  unsigned getReturnAddressReg(const MachineFunction &MF) const {
+    // Not a callee saved register.
+    return AMDGPU::SGPR30_SGPR31;
+  }
+
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
                            unsigned LoadStoreOp,
@@ -186,11 +186,23 @@ def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
 def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
 def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
 
-let isTerminator = 1, isBarrier = 1,
-    isBranch = 1, isIndirectBranch = 1 in {
+let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
+
+let isBranch = 1, isIndirectBranch = 1 in {
 def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
+} // End isBranch = 1, isIndirectBranch = 1
+
+let isReturn = 1 in {
+// Define variant marked as return rather than branch.
+def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
+}
-def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
 } // End isTerminator = 1, isBarrier = 1
 
+let isCall = 1 in {
+def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64"
+>;
+}
+
 def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
 
 let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
@@ -518,7 +518,18 @@ bool isCompute(CallingConv::ID cc) {
 }
 
 bool isEntryFunctionCC(CallingConv::ID CC) {
-  return true;
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_HS:
+    return true;
+  default:
+    return false;
+  }
 }
 
 bool isSI(const MCSubtargetInfo &STI) {
@@ -262,7 +262,6 @@ bool isEntryFunctionCC(CallingConv::ID CC);
 LLVM_READNONE
 inline bool isKernel(CallingConv::ID CC) {
   switch (CC) {
-  case CallingConv::C:
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
     return true;
@@ -6,7 +6,8 @@
 ; Tests for add.
 ; CHECK: name: addi32
 ; CHECK: {{%[0-9]+}}(s32) = G_ADD
-define i32 @addi32(i32 %arg1, i32 %arg2) {
+define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
   %res = add i32 %arg1, %arg2
-  ret i32 %res
+  store i32 %res, i32 addrspace(1)* undef
+  ret void
 }
test/CodeGen/AMDGPU/frame-index-elimination.ll (new file, 124 lines)
@@ -0,0 +1,124 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; Test that non-entry function frame indices are expanded properly to
; give an index relative to the scratch wave offset register

; Materialize into a mov. Make sure there isn't an unnecessary copy.
; GCN-LABEL: {{^}}func_mov_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_sub_u32 vcc_hi, s5, s4
; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_mov_fi_i32() #0 {
  %alloca = alloca i32
  store volatile i32* %alloca, i32* addrspace(3)* undef
  ret void
}

; Materialize into an add of a constant offset from the FI.
; FIXME: Should be able to merge adds

; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_sub_u32 s6, s5, s4
; GCN-NEXT: s_lshr_b32 s6, s6, 6
; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4
; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_add_constant_to_fi_i32() #0 {
  %alloca = alloca [2 x i32], align 4
  %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1
  store volatile i32* %gep0, i32* addrspace(3)* undef
  ret void
}

; A user the materialized frame index can't be meaningfully folded
; into.

; GCN-LABEL: {{^}}func_other_fi_user_i32:
; GCN: s_sub_u32 vcc_hi, s5, s4
; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_other_fi_user_i32() #0 {
  %alloca = alloca [2 x i32], align 4
  %ptrtoint = ptrtoint [2 x i32]* %alloca to i32
  %mul = mul i32 %ptrtoint, 9
  store volatile i32 %mul, i32 addrspace(3)* undef
  ret void
}

; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
; GCN: v_mov_b32_e32 v1, 15{{$}}
; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}}
define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 {
  store volatile i32 15, i32* %ptr
  ret void
}

; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
; GCN: s_waitcnt
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}}
define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
  %val = load volatile i32, i32* %ptr
  ret void
}

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
; GCN: s_waitcnt
; GCN-NEXT: s_sub_u32 s6, s5, s4
; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6
; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 {
  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
  %load1 = load i32, i32* %gep1
  store volatile i32* %gep1, i32* addrspace(3)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5
; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 {
  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
  %load0 = load i8, i8* %gep0
  %load1 = load i32, i32* %gep1
  store volatile i8 %load0, i8 addrspace(3)* undef
  store volatile i32 %load1, i32 addrspace(3)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
; GCN: s_sub_u32 s8, s5, s4
; GCN: v_lshr_b32_e64 v1, s8, 6
; GCN: s_and_saveexec_b64

; GCN: v_add_i32_e32 v0, vcc, 4, v1
; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4
; GCN: ds_write_b32
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 {
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %bb, label %ret

bb:
  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
  %load1 = load volatile i32, i32* %gep1
  store volatile i32* %gep1, i32* addrspace(3)* undef
  br label %ret

ret:
  ret void
}

attributes #0 = { nounwind }
test/CodeGen/AMDGPU/function-args.ll (new file, 734 lines)
@@ -0,0 +1,734 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i1:
|
||||
; GCN: v_and_b32_e32 v0, 1, v0
|
||||
; GCN: buffer_store_byte v0, off
|
||||
define void @void_func_i1(i1 %arg0) #0 {
|
||||
store i1 %arg0, i1 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i1_zeroext:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: v_or_b32_e32 v0, 12, v0
|
||||
; GCN-NOT: v0
|
||||
; GCN: buffer_store_dword v0, off
|
||||
define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
|
||||
%ext = zext i1 %arg0 to i32
|
||||
%add = add i32 %ext, 12
|
||||
store i32 %add, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i1_signext:
|
||||
; GCN: s_waitcnt
|
||||
; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0
|
||||
; GCN-NOT: v0
|
||||
; GCN: buffer_store_dword v0, off
|
||||
define void @void_func_i1_signext(i1 signext %arg0) #0 {
|
||||
%ext = sext i1 %arg0 to i32
|
||||
%add = add i32 %ext, 12
|
||||
store i32 %add, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i8:
|
||||
; GCN-NOT: v0
|
||||
; GCN: buffer_store_byte v0, off
|
||||
define void @void_func_i8(i8 %arg0) #0 {
|
||||
store i8 %arg0, i8 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i8_zeroext:
|
||||
; GCN-NOT: and_b32
|
||||
; GCN: v_add_i32_e32 v0, vcc, 12, v0
|
||||
define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
|
||||
%ext = zext i8 %arg0 to i32
|
||||
%add = add i32 %ext, 12
|
||||
store i32 %add, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i8_signext:
|
||||
; GCN-NOT: v_bfe_i32
|
||||
; GCN: v_add_i32_e32 v0, vcc, 12, v0
|
||||
define void @void_func_i8_signext(i8 signext %arg0) #0 {
|
||||
%ext = sext i8 %arg0 to i32
|
||||
%add = add i32 %ext, 12
|
||||
store i32 %add, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i16:
|
||||
; GCN: buffer_store_short v0, off
|
||||
define void @void_func_i16(i16 %arg0) #0 {
|
||||
store i16 %arg0, i16 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i16_zeroext:
|
||||
; GCN-NOT: v0
|
||||
; GCN: v_add_i32_e32 v0, vcc, 12, v0
|
||||
define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
|
||||
%ext = zext i16 %arg0 to i32
|
||||
%add = add i32 %ext, 12
|
||||
store i32 %add, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i16_signext:
|
||||
; GCN-NOT: v0
|
||||
; GCN: v_add_i32_e32 v0, vcc, 12, v0
|
||||
define void @void_func_i16_signext(i16 signext %arg0) #0 {
|
||||
%ext = sext i16 %arg0 to i32
|
||||
%add = add i32 %ext, 12
|
||||
store i32 %add, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i32:
|
||||
; GCN-NOT: v0
|
||||
; GCN: buffer_store_dword v0, off
|
||||
define void @void_func_i32(i32 %arg0) #0 {
|
||||
store i32 %arg0, i32 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_i64:
|
||||
; GCN-NOT: v[0:1]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: buffer_store_dwordx2 v[0:1], off
|
||||
define void @void_func_i64(i64 %arg0) #0 {
|
||||
store i64 %arg0, i64 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_f16:
|
||||
; VI-NOT: v0
|
||||
; CI: v_cvt_f16_f32_e32 v0, v0
|
||||
; GCN: buffer_store_short v0, off
|
||||
define void @void_func_f16(half %arg0) #0 {
|
||||
store half %arg0, half addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_f32
|
||||
; GCN-NOT: v0
|
||||
; GCN: buffer_store_dword v0, off
|
||||
define void @void_func_f32(float %arg0) #0 {
|
||||
store float %arg0, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_f64:
|
||||
; GCN-NOT: v[0:1]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: buffer_store_dwordx2 v[0:1], off
|
||||
define void @void_func_f64(double %arg0) #0 {
|
||||
store double %arg0, double addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v2i32:
|
||||
; GCN-NOT: v[0:1]
|
||||
; GCN-NOT: v0
|
||||
; GCN-NOT: v1
|
||||
; GCN: buffer_store_dwordx2 v[0:1], off
|
||||
define void @void_func_v2i32(<2 x i32> %arg0) #0 {
|
||||
store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v3i32:
|
||||
; GCN-DAG: buffer_store_dword v2, off
|
||||
; GCN-DAG: buffer_store_dwordx2 v[0:1], off
|
||||
define void @void_func_v3i32(<3 x i32> %arg0) #0 {
|
||||
store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v4i32:
|
||||
; GCN: buffer_store_dwordx4 v[0:3], off
|
||||
define void @void_func_v4i32(<4 x i32> %arg0) #0 {
|
||||
store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v5i32:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dword v4, off
|
||||
define void @void_func_v5i32(<5 x i32> %arg0) #0 {
|
||||
store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v8i32:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
define void @void_func_v8i32(<8 x i32> %arg0) #0 {
|
||||
store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v16i32:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
|
||||
define void @void_func_v16i32(<16 x i32> %arg0) #0 {
|
||||
store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v32i32:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
|
||||
define void @void_func_v32i32(<32 x i32> %arg0) #0 {
|
||||
store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; 1 over register limit
|
||||
; GCN-LABEL: {{^}}void_func_v33i32:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
|
||||
; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5
|
||||
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
|
||||
; GCN: buffer_store_dword [[STACKLOAD]], off
|
||||
define void @void_func_v33i32(<33 x i32> %arg0) #0 {
|
||||
store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v2i64:
|
||||
; GCN: buffer_store_dwordx4 v[0:3], off
|
||||
define void @void_func_v2i64(<2 x i64> %arg0) #0 {
|
||||
store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v3i64:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx2 v[4:5], off
|
||||
define void @void_func_v3i64(<3 x i64> %arg0) #0 {
|
||||
store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v4i64:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
define void @void_func_v4i64(<4 x i64> %arg0) #0 {
|
||||
store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v5i64:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
; GCN-DAG: buffer_store_dwordx2 v[8:9], off
|
||||
define void @void_func_v5i64(<5 x i64> %arg0) #0 {
|
||||
store <5 x i64> %arg0, <5 x i64> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v8i64:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
|
||||
define void @void_func_v8i64(<8 x i64> %arg0) #0 {
|
||||
store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v16i64:
|
||||
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
|
||||
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
|
||||
define void @void_func_v16i64(<16 x i64> %arg0) #0 {
|
||||
store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||

; GCN-LABEL: {{^}}void_func_v2i16:
; GFX9-NOT: v0
; GFX9: buffer_store_dword v0, off
define void @void_func_v2i16(<2 x i16> %arg0) #0 {
  store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v3i16:
; GCN-DAG: buffer_store_dword v0, off
; GCN-DAG: buffer_store_short v2, off
define void @void_func_v3i16(<3 x i16> %arg0) #0 {
  store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v4i16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9: buffer_store_dwordx2 v[0:1], off
define void @void_func_v4i16(<4 x i16> %arg0) #0 {
  store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v5i16:
; GCN-DAG: buffer_store_short v4, off,
; GCN-DAG: buffer_store_dwordx2 v[1:2], off
define void @void_func_v5i16(<5 x i16> %arg0) #0 {
  store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v8i16:
; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
define void @void_func_v8i16(<8 x i16> %arg0) #0 {
  store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v16i16:
; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v16i16(<16 x i16> %arg0) #0 {
  store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v2f32:
; GCN-NOT: v[0:1]
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: buffer_store_dwordx2 v[0:1], off
define void @void_func_v2f32(<2 x float> %arg0) #0 {
  store <2 x float> %arg0, <2 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v3f32:
; GCN-DAG: buffer_store_dword v2, off
; GCN-DAG: buffer_store_dwordx2 v[0:1], off
define void @void_func_v3f32(<3 x float> %arg0) #0 {
  store <3 x float> %arg0, <3 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v4f32:
; GCN: buffer_store_dwordx4 v[0:3], off
define void @void_func_v4f32(<4 x float> %arg0) #0 {
  store <4 x float> %arg0, <4 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v8f32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v8f32(<8 x float> %arg0) #0 {
  store <8 x float> %arg0, <8 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v16f32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
define void @void_func_v16f32(<16 x float> %arg0) #0 {
  store <16 x float> %arg0, <16 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v2f64:
; GCN: buffer_store_dwordx4 v[0:3], off
define void @void_func_v2f64(<2 x double> %arg0) #0 {
  store <2 x double> %arg0, <2 x double> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v3f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx2 v[4:5], off
define void @void_func_v3f64(<3 x double> %arg0) #0 {
  store <3 x double> %arg0, <3 x double> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v4f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v4f64(<4 x double> %arg0) #0 {
  store <4 x double> %arg0, <4 x double> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v8f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
define void @void_func_v8f64(<8 x double> %arg0) #0 {
  store <8 x double> %arg0, <8 x double> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v16f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
define void @void_func_v16f64(<16 x double> %arg0) #0 {
  store <16 x double> %arg0, <16 x double> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v2f16:
; GFX9-NOT: v0
; GFX9: buffer_store_dword v0, off
define void @void_func_v2f16(<2 x half> %arg0) #0 {
  store <2 x half> %arg0, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v3f16:
; GFX9-NOT: v0
; GCN-DAG: buffer_store_dword v0, off
; GCN-DAG: buffer_store_short v2, off
define void @void_func_v3f16(<3 x half> %arg0) #0 {
  store <3 x half> %arg0, <3 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v4f16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9-NOT: v[0:1]
; GFX9: buffer_store_dwordx2 v[0:1], off
define void @void_func_v4f16(<4 x half> %arg0) #0 {
  store <4 x half> %arg0, <4 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v8f16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9: buffer_store_dwordx4 v[0:3], off
define void @void_func_v8f16(<8 x half> %arg0) #0 {
  store <8 x half> %arg0, <8 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v16f16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v16f16(<16 x half> %arg0) #0 {
  store <16 x half> %arg0, <16 x half> addrspace(1)* undef
  ret void
}

; Make sure there is no alignment requirement for passed vgprs.
; GCN-LABEL: {{^}}void_func_i32_i64_i32:
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
; GCN: buffer_store_dwordx2 v[1:2]
; GCN: buffer_store_dword v3
define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
  store volatile i32 %arg0, i32 addrspace(1)* undef
  store volatile i64 %arg1, i64 addrspace(1)* undef
  store volatile i32 %arg2, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_struct_i32:
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
define void @void_func_struct_i32({ i32 } %arg0) #0 {
  store { i32 } %arg0, { i32 } addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_struct_i8_i32:
; GCN-DAG: buffer_store_byte v0, off
; GCN-DAG: buffer_store_dword v1, off
define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
  store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32:
; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_store_dword v[[ELT1]]
; GCN-DAG: buffer_store_byte v[[ELT0]]
define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 {
  %arg0.load = load { i8, i32 }, { i8, i32 }* %arg0
  store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
  ret void
}
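
; A sketch of what the checks above assert: the byval aggregate lives in the
; caller's stack frame, so the callee reloads it through the incoming stack
; pointer (s5 here) at its natural offsets rather than receiving it in VGPRs.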

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}

; GCN: ds_write_b32 v0, v0
; GCN: s_setpc_b64
define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 {
  %arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0
  %arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1
  store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
  store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef
  store volatile i32 %arg2, i32 addrspace(3)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64:
; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off
define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 {
  %arg0.load = load i32, i32* %arg0
  %arg1.load = load i64, i64* %arg1
  store i32 %arg0.load, i32 addrspace(1)* undef
  store i64 %arg1.load, i64 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v32i32_i32_i64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:8

; GCN: buffer_store_dword v[[LOAD_ARG1]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile i32 %arg1, i32 addrspace(1)* undef
  store volatile i64 %arg2, i64 addrspace(1)* undef
  ret void
}
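
; The i32 and i64 after the 32-dword vector no longer fit in v0-v31, so the
; checks above expect them to be reloaded from the stack at 4-byte aligned
; offsets (0, 4, and 8 from the incoming stack pointer), with no extra
; padding before the i64.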

; FIXME: Different ext load types on CI vs. VI
; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16:
; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}

; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}

; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]]
; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]]

; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off
; GCN: buffer_store_byte [[LOAD_ARG2]], off
; GCN: buffer_store_short [[LOAD_ARG3]], off
; VI: buffer_store_short [[LOAD_ARG4]], off

; CI: buffer_store_short [[CVT_ARG4]], off
define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile i1 %arg1, i1 addrspace(1)* undef
  store volatile i8 %arg2, i8 addrspace(1)* undef
  store volatile i16 %arg3, i16 addrspace(1)* undef
  store volatile half %arg4, half addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}

; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef
  store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16:
; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GFX9: buffer_store_dword [[LOAD_ARG1]], off
; GFX9: buffer_store_short [[LOAD_ARG2]], off
define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef
  store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}

; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}

; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef
  store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}

; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}

; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef
  store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}

; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:60{{$}}

; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef
  store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s5 offset:60{{$}}

; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:64{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:68{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:72{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:76{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:80{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:84{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:88{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:92{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:96{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:100{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:104{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:108{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:112{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:116{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:120{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:124{{$}}
define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
  store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef
  ret void
}

; Check there is no crash.
; GCN-LABEL: {{^}}void_func_v16i8:
define void @void_func_v16i8(<16 x i8> %arg0) #0 {
  store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef
  ret void
}

; Check there is no crash.
; GCN-LABEL: {{^}}void_func_v32i32_v16i8:
define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
  store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
  store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef
  ret void
}

attributes #0 = { nounwind }

test/CodeGen/AMDGPU/function-returns.ll (new file, 514 lines)
@ -0,0 +1,514 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
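
; This file checks the callee side of return values: scalars and small vectors
; appear to come back in VGPRs starting at v0, with the final memory access
; still outstanding at the return (hence the s_waitcnt before s_setpc_b64).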

; GCN-LABEL: {{^}}i1_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define i1 @i1_func_void() #0 {
  %val = load i1, i1 addrspace(1)* undef
  ret i1 %val
}

; FIXME: Missing and?
; GCN-LABEL: {{^}}i1_zeroext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i1 @i1_zeroext_func_void() #0 {
  %val = load i1, i1 addrspace(1)* undef
  ret i1 %val
}

; GCN-LABEL: {{^}}i1_signext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}}
; GCN-NEXT: s_setpc_b64
define signext i1 @i1_signext_func_void() #0 {
  %val = load i1, i1 addrspace(1)* undef
  ret i1 %val
}

; GCN-LABEL: {{^}}i8_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i8 @i8_func_void() #0 {
  %val = load i8, i8 addrspace(1)* undef
  ret i8 %val
}

; GCN-LABEL: {{^}}i8_zeroext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i8 @i8_zeroext_func_void() #0 {
  %val = load i8, i8 addrspace(1)* undef
  ret i8 %val
}

; GCN-LABEL: {{^}}i8_signext_func_void:
; GCN: buffer_load_sbyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define signext i8 @i8_signext_func_void() #0 {
  %val = load i8, i8 addrspace(1)* undef
  ret i8 %val
}

; GCN-LABEL: {{^}}i16_func_void:
; GCN: buffer_load_ushort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i16 @i16_func_void() #0 {
  %val = load i16, i16 addrspace(1)* undef
  ret i16 %val
}

; GCN-LABEL: {{^}}i16_zeroext_func_void:
; GCN: buffer_load_ushort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i16 @i16_zeroext_func_void() #0 {
  %val = load i16, i16 addrspace(1)* undef
  ret i16 %val
}

; GCN-LABEL: {{^}}i16_signext_func_void:
; GCN: buffer_load_sshort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define signext i16 @i16_signext_func_void() #0 {
  %val = load i16, i16 addrspace(1)* undef
  ret i16 %val
}

; GCN-LABEL: {{^}}i32_func_void:
; GCN: buffer_load_dword v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i32 @i32_func_void() #0 {
  %val = load i32, i32 addrspace(1)* undef
  ret i32 %val
}

; GCN-LABEL: {{^}}i64_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i64 @i64_func_void() #0 {
  %val = load i64, i64 addrspace(1)* undef
  ret i64 %val
}

; GCN-LABEL: {{^}}f32_func_void:
; GCN: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define float @f32_func_void() #0 {
  %val = load float, float addrspace(1)* undef
  ret float %val
}

; GCN-LABEL: {{^}}f64_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define double @f64_func_void() #0 {
  %val = load double, double addrspace(1)* undef
  ret double %val
}

; GCN-LABEL: {{^}}v2i32_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <2 x i32> @v2i32_func_void() #0 {
  %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
  ret <2 x i32> %val
}

; GCN-LABEL: {{^}}v3i32_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i32> @v3i32_func_void() #0 {
  %val = load <3 x i32>, <3 x i32> addrspace(1)* undef
  ret <3 x i32> %val
}

; GCN-LABEL: {{^}}v4i32_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <4 x i32> @v4i32_func_void() #0 {
  %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
  ret <4 x i32> %val
}

; GCN-LABEL: {{^}}v5i32_func_void:
; GCN-DAG: buffer_load_dword v4, off
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i32> @v5i32_func_void() #0 {
  %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef
  ret <5 x i32> %val
}

; GCN-LABEL: {{^}}v8i32_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <8 x i32> @v8i32_func_void() #0 {
  %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
  %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
  ret <8 x i32> %val
}

; GCN-LABEL: {{^}}v16i32_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <16 x i32> @v16i32_func_void() #0 {
  %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
  %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
  ret <16 x i32> %val
}

; GCN-LABEL: {{^}}v32i32_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <32 x i32> @v32i32_func_void() #0 {
  %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
  %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
  ret <32 x i32> %val
}
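
; As with arguments, a 32-dword return value fills v0-v31 exactly; the
; <33 x i32> and large-struct tests further down show what appears to happen
; once that budget is exceeded.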

; GCN-LABEL: {{^}}v2i64_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <2 x i64> @v2i64_func_void() #0 {
  %val = load <2 x i64>, <2 x i64> addrspace(1)* undef
  ret <2 x i64> %val
}

; GCN-LABEL: {{^}}v3i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i64> @v3i64_func_void() #0 {
  %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef
  %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
  ret <3 x i64> %val
}

; GCN-LABEL: {{^}}v4i64_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN: buffer_load_dwordx4 v[4:7], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <4 x i64> @v4i64_func_void() #0 {
  %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef
  %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
  ret <4 x i64> %val
}

; GCN-LABEL: {{^}}v5i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i64> @v5i64_func_void() #0 {
  %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef
  %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
  ret <5 x i64> %val
}

; GCN-LABEL: {{^}}v8i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <8 x i64> @v8i64_func_void() #0 {
  %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef
  %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
  ret <8 x i64> %val
}

; GCN-LABEL: {{^}}v16i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <16 x i64> @v16i64_func_void() #0 {
  %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef
  %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
  ret <16 x i64> %val
}

; GCN-LABEL: {{^}}v2i16_func_void:
; GFX9: buffer_load_dword v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <2 x i16> @v2i16_func_void() #0 {
  %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
  ret <2 x i16> %val
}

; GCN-LABEL: {{^}}v3i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v3i16_func_void() #0 {
  %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
  ret <3 x i16> %val
}

; GCN-LABEL: {{^}}v4i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v4i16_func_void() #0 {
  %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
  ret <4 x i16> %val
}

; FIXME: Should not scalarize
; GCN-LABEL: {{^}}v5i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1]
; GFX9: buffer_load_ushort v4
; GFX9: v_lshrrev_b32_e32 v3, 16, v1
; GFX9: v_mov_b32_e32 v2, v1
; GFX9: v_lshrrev_b32_e32 v3, 16, v0
; GCN: s_setpc_b64
define <5 x i16> @v5i16_func_void() #0 {
  %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef
  %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
  ret <5 x i16> %val
}

; GCN-LABEL: {{^}}v8i16_func_void:
; GFX9-DAG: buffer_load_dwordx4 v[0:3], off
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <8 x i16> @v8i16_func_void() #0 {
  %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef
  %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
  ret <8 x i16> %val
}

; GCN-LABEL: {{^}}v16i16_func_void:
; GFX9: buffer_load_dwordx4 v[0:3], off
; GFX9: buffer_load_dwordx4 v[4:7], off
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <16 x i16> @v16i16_func_void() #0 {
  %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef
  %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
  ret <16 x i16> %val
}

; FIXME: Should pack
; GCN-LABEL: {{^}}v16i8_func_void:
; GCN-DAG: v12
; GCN-DAG: v13
; GCN-DAG: v14
; GCN-DAG: v15
define <16 x i8> @v16i8_func_void() #0 {
  %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
  %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
  ret <16 x i8> %val
}

; FIXME: Should pack
; GCN-LABEL: {{^}}v4i8_func_void:
; GCN: buffer_load_dword v0
; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0
; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
; CI-DAG: v_bfe_u32 v1, v0, 8, 8
; VI-DAG: v_lshrrev_b16_e32 v1, 8, v0
; GCN: s_setpc_b64
define <4 x i8> @v4i8_func_void() #0 {
  %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef
  %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
  ret <4 x i8> %val
}

; GCN-LABEL: {{^}}struct_i8_i32_func_void:
; GCN-DAG: buffer_load_dword v1
; GCN-DAG: buffer_load_ubyte v0
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define { i8, i32 } @struct_i8_i32_func_void() #0 {
  %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef
  ret { i8, i32 } %val
}

; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32:
; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]]
; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}}
; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}}
define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 {
  %val0 = load volatile i8, i8 addrspace(1)* undef
  %val1 = load volatile i32, i32 addrspace(1)* undef
  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
  store i8 %val0, i8* %gep0
  store i32 %val1, i32* %gep1
  ret void
}
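
; With sret, the caller passes a pointer for the return value; the checks
; above show it arriving in v0 and the struct being written through it
; ("offen" takes v0 as the per-thread offset) rather than the value being
; returned in registers.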

; GCN-LABEL: {{^}}v33i32_func_void:
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <33 x i32> @v33i32_func_void() #0 {
  %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef
  %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
  ret <33 x i32> %val
}
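
; A <33 x i32> no longer fits in the 32 return VGPRs, so it appears to be
; returned indirectly: the caller provides a buffer pointer in v0 and the
; callee writes all 33 dwords through it, much like the sret case above.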

; GCN-LABEL: {{^}}struct_v32i32_i32_func_void:
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
  %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef
  %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
  ret { <32 x i32>, i32 } %val
}

; GCN-LABEL: {{^}}struct_i32_v32i32_func_void:
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
  %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef
  %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
  ret { i32, <32 x i32> } %val
}

attributes #0 = { nounwind }
@ -27,7 +27,7 @@

; ELF: Symbol {
; ELF: Name: simple
; ELF: Size: 44
; ELF: Size: 48
; ELF: Type: Function (0x2)
; ELF: }

@ -41,14 +41,12 @@
; HSA: .p2align 2
; HSA: {{^}}simple:
; HSA-NOT: amd_kernel_code_t

; FIXME: Check this isn't a kernarg load when calling convention implemented.
; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
; HSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; Make sure we are setting the ATC bit:
; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
; HSA-CI: s_mov_b32 s[[HI:[0-9]+]], 0x100f000
; On VI+ we also need to set MTYPE = 2
; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
; HSA-VI: s_mov_b32 s[[HI:[0-9]+]], 0x1100f000
; Make sure we generate flat store for HSA
; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}

@ -56,8 +54,9 @@
; HSA: .size simple, .Lfunc_end0-simple
; HSA: ; Function info:
; HSA-NOT: COMPUTE_PGM_RSRC2
define void @simple(i32 addrspace(1)* %out) {
define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) {
entry:
  %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
  store i32 0, i32 addrspace(1)* %out
  ret void
}

@ -191,7 +191,7 @@ entry:
; CHECK: v_mov_b32_e32 v0, s0
; CHECK: v_mov_b32_e32 v1, s1
; CHECK: use v[0:1]
define void @i64_imm_input_phys_vgpr() {
define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
entry:
  call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456)
  ret void

@ -1,4 +1,12 @@
# RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
--- |

  define amdgpu_kernel void @func0() {
    ret void
  }

...

---
# We should not detect any interference between v0/v1 here and only allocate
# sgpr0-sgpr3.