
AMDGPU: Start defining a calling convention

Partially implement the callee side for arguments and return values.
byval doesn't work properly, and most likely sret or other on-stack
return values are broken as well.

llvm-svn: 303308
Matt Arsenault 2017-05-17 21:56:25 +00:00
parent 374681b328
commit ab4fb8ba2f
29 changed files with 1852 additions and 127 deletions


@ -38,7 +38,8 @@ class AMDGPUCallLowering: public CallLowering {
unsigned VReg) const override;
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
};
} // End of namespace llvm;
#endif
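
Both accessors are now static, so the SelectionDAG path can reuse them: further down in this commit, AMDGPUTargetLowering::CCAssignFnForCall and CCAssignFnForReturn shrink to forwarding calls. As a standalone preview of the mapping those functions implement (a model using plain enums and strings rather than LLVM's types, with the kernel and shader conventions each collapsed to one representative; the real dispatch lives next to the AMDGPUGenCallingConv.inc include below):

#include <stdexcept>
#include <string>

// Model of CCAssignFnForCall/CCAssignFnForReturn: each calling convention
// pairs one generated argument table with one generated return table.
enum class CC { Kernel, Shader, C, Fast };

std::string assignFnForCall(CC Conv) {
  switch (Conv) {
  case CC::Kernel: return "CC_AMDGPU_Kernel";
  case CC::Shader: return "CC_AMDGPU";
  case CC::C:
  case CC::Fast:   return "CC_AMDGPU_Func";    // new in this patch
  }
  throw std::runtime_error("Unsupported calling convention.");
}

std::string assignFnForReturn(CC Conv) {
  switch (Conv) {
  case CC::Kernel: return "CC_AMDGPU_Kernel";
  case CC::Shader: return "RetCC_SI_Shader";
  case CC::C:
  case CC::Fast:   return "RetCC_AMDGPU_Func"; // new in this patch
  }
  throw std::runtime_error("Unsupported calling convention.");
}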


@ -13,6 +13,8 @@
// Inversion of CCIfInReg
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
class CCIfExtend<CCAction A>
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
// Calling convention for SI
def CC_SI : CallingConv<[
@ -52,7 +54,7 @@ def CC_SI : CallingConv<[
]>>>
]>;
def RetCC_SI : CallingConv<[
def RetCC_SI_Shader : CallingConv<[
CCIfType<[i32] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
@ -99,6 +101,52 @@ def CC_AMDGPU_Kernel : CallingConv<[
CCCustom<"allocateKernArg">
]>;
def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
(sequence "VGPR%u", 24, 255)
>;
def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;
def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 103)
>;
def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
(add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
>;
// Calling convention for leaf functions
def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
]>;
// Calling convention for leaf functions
def RetCC_AMDGPU_Func : CallingConv<[
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
]>;
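
Concretely, CC_AMDGPU_Func above hands a 32-bit argument the first free register out of VGPR0-VGPR31 and spills the rest to 4-byte-aligned stack slots, so the 33rd i32 lands in memory (exactly what the void_func_v33i32 test added below checks). A standalone sketch of the i32 path, assuming a homogeneous list of i32 arguments:

#include <cstdio>

// Model of CC_AMDGPU_Func for i32: VGPR0..VGPR31 in order, then the stack.
struct Loc {
  bool InReg;
  unsigned Value; // VGPR number, or byte offset of the stack slot
};

Loc assignI32(unsigned ArgIndex) {
  if (ArgIndex < 32)
    return {true, ArgIndex};           // CCAssignToReg<[VGPR0, ..., VGPR31]>
  return {false, (ArgIndex - 32) * 4}; // CCAssignToStack<4, 4>
}

int main() {
  Loc L = assignI32(32); // the 33rd i32 argument
  if (L.InReg)
    std::printf("VGPR%u\n", L.Value);
  else
    std::printf("stack offset %u\n", L.Value); // prints "stack offset 0"
  return 0;
}
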
def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >="


@ -76,6 +76,45 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
}
}
// Allocate up to VGPR31.
//
// TODO: Since there are no VGPR alignment requirements, would it be better to
// split into individual scalar registers?
static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
switch (LocVT.SimpleTy) {
case MVT::i64:
case MVT::f64:
case MVT::v2i32:
case MVT::v2f32: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_64RegClass, 31);
}
case MVT::v4i32:
case MVT::v4f32:
case MVT::v2i64:
case MVT::v2f64: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_128RegClass, 29);
}
case MVT::v8i32:
case MVT::v8f32: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_256RegClass, 25);
}
case MVT::v16i32:
case MVT::v16f32: {
return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
&AMDGPU::VReg_512RegClass, 17);
}
default:
return false;
}
}
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@ -773,8 +812,43 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
//===---------------------------------------------------------------------===//
CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
return CC_AMDGPU;
bool IsVarArg) {
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return CC_AMDGPU_Kernel;
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_HS:
return CC_AMDGPU;
case CallingConv::C:
case CallingConv::Fast:
return CC_AMDGPU_Func;
default:
report_fatal_error("Unsupported calling convention.");
}
}
CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
bool IsVarArg) {
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return CC_AMDGPU_Kernel;
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_HS:
return RetCC_SI_Shader;
case CallingConv::C:
case CallingConv::Fast:
return RetCC_AMDGPU_Func;
default:
report_fatal_error("Unsupported calling convention.");
}
}
/// The SelectionDAGBuilder will automatically promote function arguments
@ -874,18 +948,15 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
}
}
void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
const SmallVectorImpl<ISD::OutputArg> &Outs) const {
State.AnalyzeReturn(Outs, RetCC_SI);
}
SDValue
AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
SDValue AMDGPUTargetLowering::LowerReturn(
SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
// FIXME: Fails for r600 tests
//assert(!isVarArg && Outs.empty() && OutVals.empty() &&
// "wave terminate should not have return values");
return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
@ -896,20 +967,12 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) {
switch (CC) {
case CallingConv::C:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return CC_AMDGPU_Kernel;
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
return CC_AMDGPU;
default:
report_fatal_error("Unsupported calling convention.");
}
return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
}
CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
bool IsVarArg) {
return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
}
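
The register counts handed to allocateCCRegs earlier in this file (31, 29, 25, 17) are how many tuples of each width still fit entirely below VGPR32: a tuple spanning N 32-bit registers can start at VGPR0 through VGPR(32 - N), giving 32 - N + 1 candidates, assuming the class's register list is ordered by first register. A quick check of that arithmetic:

#include <cassert>

// Number of N-register VGPR tuples whose last register is at or below VGPR31.
constexpr unsigned tuplesBelowVGPR32(unsigned N) { return 32 - N + 1; }

int main() {
  assert(tuplesBelowVGPR32(2)  == 31); // i64/f64/v2i32/v2f32 -> VReg_64
  assert(tuplesBelowVGPR32(4)  == 29); // v4i32/v2i64/...     -> VReg_128
  assert(tuplesBelowVGPR32(8)  == 25); // v8i32/v8f32         -> VReg_256
  assert(tuplesBelowVGPR32(16) == 17); // v16i32/v16f32       -> VReg_512
  return 0;
}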
SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,


@ -115,9 +115,6 @@ protected:
SmallVectorImpl<SDValue> &Results) const;
void analyzeFormalArgumentsCompute(CCState &State,
const SmallVectorImpl<ISD::InputArg> &Ins) const;
void AnalyzeReturn(CCState &State,
const SmallVectorImpl<ISD::OutputArg> &Outs) const;
public:
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
@ -164,6 +161,8 @@ public:
bool isCheapToSpeculateCtlz() const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,


@ -380,6 +380,6 @@ def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;


@ -126,9 +126,15 @@ bool AMDGPUMCInstLower::lowerOperand(const MachineOperand &MO,
}
void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
unsigned Opcode = MI->getOpcode();
int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
// FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
// need to select it to the subtarget specific version, and there's no way to
// do that with a single pseudo source operation.
if (Opcode == AMDGPU::S_SETPC_B64_return)
Opcode = AMDGPU::S_SETPC_B64;
int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
if (MCOpcode == -1) {
LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "


@ -12,21 +12,6 @@
using namespace llvm;
static bool isEntryFunctionCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
return true;
default:
return false;
}
}
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
@ -34,7 +19,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MaxKernArgAlign(0),
LDSSize(0),
ABIArgOffset(0),
IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
// except reserved size is not correctly aligned.


@ -14,6 +14,7 @@
#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIRegisterInfo.h"
using namespace llvm;
@ -24,18 +25,6 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
const MachineFunction *) const {
return &CalleeSavedReg;
}
unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return AMDGPU::NoRegister;
}
unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
static const unsigned SubRegs[] = {
AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
@ -50,3 +39,35 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"
// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
const MachineFunction *MF) const {
CallingConv::ID CC = MF->getFunction()->getCallingConv();
switch (CC) {
case CallingConv::C:
case CallingConv::Fast:
return CSR_AMDGPU_HighRegs_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
return &NoCalleeSavedReg;
}
}
}
const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
switch (CC) {
case CallingConv::C:
case CallingConv::Fast:
return CSR_AMDGPU_HighRegs_RegMask;
default:
return nullptr;
}
}
unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return AMDGPU::NoRegister;
}
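
The net effect of the CSR sets and the two hooks above: for CallingConv::C and CallingConv::Fast, a callee must preserve VGPR32-VGPR255 and SGPR32-SGPR103, while everything below (including the VGPR0-VGPR31 argument registers) is clobbered at call sites. A standalone classifier stating that split, derived from the CSR_AMDGPU_HighRegs definition earlier:

// Model of the callee-saved split under the new function calling convention;
// registers are identified only by class and number, not LLVM's enums.
bool isCalleeSavedVGPR(unsigned N) { return N >= 32 && N <= 255; }
bool isCalleeSavedSGPR(unsigned N) { return N >= 32 && N <= 103; }
// VGPR0-31 and SGPR0-31 are caller-saved, which is why getCallPreservedMask
// above returns CSR_AMDGPU_HighRegs_RegMask only for C/Fast and no mask at
// all for the entry conventions, which cannot make calls.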


@ -30,9 +30,6 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
unsigned getSubRegFromChannel(unsigned Channel) const;
const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
};
} // End namespace llvm


@ -56,6 +56,18 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
return Reserved;
}
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
const MachineFunction *) const {
return &CalleeSavedReg;
}
unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
return AMDGPU::NoRegister;
}
unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
}


@ -27,6 +27,8 @@ struct R600RegisterInfo final : public AMDGPURegisterInfo {
R600RegisterInfo();
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
/// \brief get the HW encoding for a register's channel.
unsigned getHWRegChan(unsigned reg) const;


@ -189,8 +189,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
// ----
// 13 (+1)
unsigned ReservedRegCount = 13;
if (SPReg != AMDGPU::NoRegister)
++ReservedRegCount;
if (AllSGPRs.size() < ReservedRegCount)
return std::make_pair(ScratchWaveOffsetReg, SPReg);
@ -208,13 +206,6 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
MFI->setScratchWaveOffsetReg(Reg);
ScratchWaveOffsetReg = Reg;
} else {
if (SPReg == AMDGPU::NoRegister)
break;
MRI.replaceRegWith(SPReg, Reg);
MFI->setStackPtrOffsetReg(Reg);
SPReg = Reg;
break;
}
}
@ -223,8 +214,8 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
return std::make_pair(ScratchWaveOffsetReg, SPReg);
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
// Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
// specified.
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@ -424,6 +415,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
}
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (MFI->isEntryFunction())
emitEntryFunctionPrologue(MF, MBB);
}
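
emitPrologue is now a dispatcher: the original body moves to emitEntryFunctionPrologue, and non-entry functions get no target prologue yet (their callee-saved spills come from generic frame lowering, which is also why requiresRegisterScavenging below starts returning true for them). A minimal model of the new control flow, with stand-in types:

// Stand-ins for MachineFunction/MachineBasicBlock; model only.
struct MFModel { bool IsEntryFunction; };
struct MBBModel {};

void emitEntryFunctionPrologueModel(MFModel &, MBBModel &) {
  // ... the scratch rsrc / wave offset setup that used to live directly in
  // emitPrologue, unchanged ...
}

void emitPrologueModel(MFModel &F, MBBModel &B) {
  if (F.IsEntryFunction)
    emitEntryFunctionPrologueModel(F, B);
  // else: nothing emitted for callable functions in this patch.
}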
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {


@ -26,6 +26,8 @@ public:
AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
void emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const;
void emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF,


@ -914,6 +914,55 @@ SDValue SITargetLowering::lowerKernargMemParameter(
return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
}
SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
const SDLoc &SL, SDValue Chain,
const ISD::InputArg &Arg) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
if (Arg.Flags.isByVal()) {
unsigned Size = Arg.Flags.getByValSize();
int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
return DAG.getFrameIndex(FrameIdx, MVT::i32);
}
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = VA.getValVT().getStoreSize();
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
MVT MemVT = VA.getValVT();
switch (VA.getLocInfo()) {
default:
break;
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
case CCValAssign::ZExt:
ExtType = ISD::ZEXTLOAD;
break;
case CCValAssign::AExt:
ExtType = ISD::EXTLOAD;
break;
}
ArgValue = DAG.getExtLoad(
ExtType, SL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
return ArgValue;
}
static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
CallingConv::ID CallConv,
ArrayRef<ISD::InputArg> Ins,
@ -1094,10 +1143,12 @@ static void allocateSystemSGPRs(CCState &CCInfo,
static void reservePrivateMemoryRegs(const TargetMachine &TM,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
SIMachineFunctionInfo &Info,
bool NeedSP) {
// Now that we've figured out where the scratch register inputs are, see if we
// should reserve the arguments and use them directly.
bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
MachineFrameInfo &MFI = MF.getFrameInfo();
bool HasStackObjects = MFI.hasStackObjects();
// Record that we know we have non-spill stack objects so we don't need to
// check all stack objects later.
@ -1155,6 +1206,15 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
Info.setScratchWaveOffsetReg(ReservedOffsetReg);
}
}
if (NeedSP) {
unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg());
assert(!TRI.isSubRegister(Info.getScratchRSrcReg(),
Info.getStackPtrOffsetReg()));
}
}
SDValue SITargetLowering::LowerFormalArguments(
@ -1223,8 +1283,10 @@ SDValue SITargetLowering::LowerFormalArguments(
!Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
!Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
!Info->hasWorkItemIDZ());
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
} else {
assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
Splits.append(Ins.begin(), Ins.end());
}
if (IsEntryFunc) {
@ -1278,11 +1340,14 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Arg);
continue;
} else if (!IsEntryFunc && VA.isMemLoc()) {
SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
InVals.push_back(Val);
if (!Arg.Flags.isByVal())
Chains.push_back(Val.getValue(1));
continue;
}
if (VA.isMemLoc())
report_fatal_error("memloc not supported with calling convention");
assert(VA.isRegLoc() && "Parameter must be in a register!");
unsigned Reg = VA.getLocReg();
@ -1291,7 +1356,7 @@ SDValue SITargetLowering::LowerFormalArguments(
Reg = MF.addLiveIn(Reg, RC);
SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
if (Arg.VT.isVector()) {
if (IsShader && Arg.VT.isVector()) {
// Build a vector from the registers
Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
unsigned NumElements = ParamType->getVectorNumElements();
@ -1317,16 +1382,49 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
// Start adding system SGPRs.
if (IsEntryFunc)
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
// TODO: Could maybe omit SP if only tail calls?
bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects();
// Start adding system SGPRs.
if (IsEntryFunc) {
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
CCInfo.AllocateReg(Info->getFrameOffsetReg());
if (NeedSP) {
unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
CCInfo.AllocateReg(StackPtrReg);
Info->setStackPtrOffsetReg(StackPtrReg);
}
}
return Chains.empty() ? Chain :
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
}
// TODO: If return values can't fit in registers, we should return as many as
// possible in registers before passing on stack.
bool SITargetLowering::CanLowerReturn(
CallingConv::ID CallConv,
MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const {
// Replacing returns with sret/stack usage doesn't make sense for shaders.
// FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
// for shaders. Vector types should be explicitly handled by CC.
if (AMDGPU::isEntryFunctionCC(CallConv))
return true;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
}
SDValue
SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
@ -1336,11 +1434,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
if (!AMDGPU::isShader(CallConv))
if (AMDGPU::isKernel(CallConv)) {
return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
OutVals, DL, DAG);
}
bool IsShader = AMDGPU::isShader(CallConv);
Info->setIfReturnsVoid(Outs.size() == 0);
bool IsWaveEnd = Info->returnsVoid() && IsShader;
SmallVector<ISD::OutputArg, 48> Splits;
SmallVector<SDValue, 48> SplitVals;
@ -1349,7 +1451,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
const ISD::OutputArg &Out = Outs[i];
if (Out.VT.isVector()) {
if (IsShader && Out.VT.isVector()) {
MVT VT = Out.VT.getVectorElementType();
ISD::OutputArg NewOut = Out;
NewOut.Flags.setSplit();
@ -1380,29 +1482,58 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
*DAG.getContext());
// Analyze outgoing return values.
AnalyzeReturn(CCInfo, Splits);
CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
SDValue Flag;
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Add return address for callable functions.
if (!Info->isEntryFunction()) {
const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
SDValue ReturnAddrReg = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
// FIXME: Should be able to use a vreg here, but need a way to prevent it
// from being allocated to a CSR.
SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
MVT::i64);
Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(PhysReturnAddrReg);
}
// Copy the result values into the output registers.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
// TODO: Partially return in registers if return values don't fit.
SDValue Arg = SplitVals[realRVLocIdx];
// Copied from other backends.
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
default:
llvm_unreachable("Unknown loc info!");
}
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
@ -1410,12 +1541,16 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
// FIXME: Does sret work properly?
// Update chain and glue.
RetOps[0] = Chain;
if (Flag.getNode())
RetOps.push_back(Flag);
unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
unsigned Opc = AMDGPUISD::ENDPGM;
if (!IsWaveEnd)
Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
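
Two pieces of the path above are worth restating. First, a stack-passed argument that the caller extended is reloaded with a matching extending load (the LocInfo switch in lowerStackParameter). Second, the return terminator is now chosen from three options: ENDPGM when a shader returns void (the wave simply ends), RETURN_TO_EPILOG for shaders with return values, and the new RET_FLAG for callable functions, which carries the return address in s[30:31] and is selected to the S_SETPC_B64_return pseudo defined below. Both decisions as standalone models (plain enums, not LLVM's types):

// Model enums; the real code uses CCValAssign::LocInfo and AMDGPUISD opcodes.
enum class LocInfo { Full, BCvt, SExt, ZExt, AExt };
enum class LoadExt { NonExt, SExtLoad, ZExtLoad, AnyExtLoad };
enum class RetOpc { ENDPGM, RETURN_TO_EPILOG, RET_FLAG };

// Mirrors lowerStackParameter: pick the load extension for a stack argument.
LoadExt loadExtFor(LocInfo LI) {
  switch (LI) {
  case LocInfo::SExt: return LoadExt::SExtLoad;
  case LocInfo::ZExt: return LoadExt::ZExtLoad;
  case LocInfo::AExt: return LoadExt::AnyExtLoad;
  default:            return LoadExt::NonExt; // Full/BCvt: plain load
  }
}

// Mirrors the end of SITargetLowering::LowerReturn: choose the terminator.
RetOpc pickReturnOpcode(bool IsShader, bool ReturnsVoid) {
  if (IsShader && ReturnsVoid)
    return RetOpc::ENDPGM;                   // wave terminates here
  return IsShader ? RetOpc::RETURN_TO_EPILOG // values live into the epilog
                  : RetOpc::RET_FLAG;        // becomes s_setpc_b64 s[30:31]
}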


@ -28,6 +28,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
uint64_t Offset, bool Signed,
const ISD::InputArg *Arg = nullptr) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
const SDLoc &SL, SDValue Chain,
const ISD::InputArg &Arg) const;
SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
SelectionDAG &DAG) const override;
SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@ -177,7 +181,12 @@ public:
const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
bool CanLowerReturn(CallingConv::ID CallConv,
MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;


@ -80,17 +80,22 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
WavesPerEU = ST.getWavesPerEU(*F);
// Non-entry functions have no special inputs for now.
// TODO: Return early for non-entry CCs.
if (!isEntryFunction()) {
// Non-entry functions have no special inputs for now, other than registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
ScratchWaveOffsetReg = AMDGPU::SGPR4;
FrameOffsetReg = AMDGPU::SGPR5;
return;
}
CallingConv::ID CC = F->getCallingConv();
if (CC == CallingConv::AMDGPU_PS)
PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
if (AMDGPU::isKernel(CC)) {
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
} else if (CC == CallingConv::AMDGPU_PS) {
PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
}
if (ST.debuggerEmitPrologue()) {
@ -120,7 +125,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
bool MaySpill = ST.isVGPRSpillingEnabled(*F);
bool HasStackObjects = FrameInfo.hasStackObjects();
bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls();
if (HasStackObjects || MaySpill) {
PrivateSegmentWaveByteOffset = true;
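
Callable functions get fixed placeholder registers for scratch access: SGPR0-SGPR3 as the scratch resource descriptor, SGPR4 as the scratch wave offset, and SGPR5 as the frame offset. These are exactly the s[0:3], s4, and s5 that the new tests below pattern-match. Summarized as a sketch (values from the constructor above, not LLVM's register enums):

// Fixed scratch-access registers for non-entry functions, as assigned in the
// SIMachineFunctionInfo constructor above.
struct CallableScratchSetup {
  const char *ScratchRSrc       = "s[0:3]"; // SGPR0_SGPR1_SGPR2_SGPR3
  const char *ScratchWaveOffset = "s4";     // SGPR4
  const char *FrameOffset       = "s5";     // SGPR5
};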


@ -388,9 +388,8 @@ public:
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != AMDGPU::NoRegister && "Should never be unset");
ScratchWaveOffsetReg = Reg;
// FIXME: Only for entry functions.
FrameOffsetReg = ScratchWaveOffsetReg;
if (isEntryFunction())
FrameOffsetReg = ScratchWaveOffsetReg;
}
unsigned getQueuePtrUserSGPR() const {


@ -117,11 +117,7 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
unsigned RegCount = ST.getMaxNumSGPRs(MF);
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned Reg;
// Try to place it in a hole after PrivateSegmentBufferReg.
@ -134,9 +130,22 @@ unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
// wave offset before it.
Reg = RegCount - 5;
}
return Reg;
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
const MachineFunction &MF) const {
return AMDGPU::SGPR32;
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@ -198,15 +207,33 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
if (StackPtrReg != AMDGPU::NoRegister) {
reserveRegisterTuples(Reserved, StackPtrReg);
assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
}
unsigned FrameReg = MFI->getFrameOffsetReg();
if (FrameReg != AMDGPU::NoRegister) {
reserveRegisterTuples(Reserved, FrameReg);
assert(!isSubRegister(ScratchRSrcReg, FrameReg));
}
return Reserved;
}
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
return Fn.getFrameInfo().hasStackObjects();
const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
if (Info->isEntryFunction()) {
const MachineFrameInfo &MFI = Fn.getFrameInfo();
return MFI.hasStackObjects() || MFI.hasCalls();
}
// May need scavenger for dealing with callee saved registers.
return true;
}
bool
SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
return MF.getFrameInfo().hasStackObjects();
}


@ -17,6 +17,7 @@
#include "AMDGPURegisterInfo.h"
#include "SIDefines.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
namespace llvm {
@ -57,8 +58,16 @@ public:
unsigned reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const;
unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
unsigned getFrameRegister(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
@ -228,6 +237,11 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
unsigned getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
}
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,


@ -186,11 +186,23 @@ def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
let isTerminator = 1, isBarrier = 1,
isBranch = 1, isIndirectBranch = 1 in {
let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
let isBranch = 1, isIndirectBranch = 1 in {
def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
} // End isBranch = 1, isIndirectBranch = 1
let isReturn = 1 in {
// Define variant marked as return rather than branch.
def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
}
def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
} // End isTerminator = 1, isBarrier = 1
let isCall = 1 in {
def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64"
>;
}
def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {


@ -518,7 +518,18 @@ bool isCompute(CallingConv::ID cc) {
}
bool isEntryFunctionCC(CallingConv::ID CC) {
return true;
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_HS:
return true;
default:
return false;
}
}
bool isSI(const MCSubtargetInfo &STI) {


@ -262,7 +262,6 @@ bool isEntryFunctionCC(CallingConv::ID CC);
LLVM_READNONE
inline bool isKernel(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return true;


@ -6,7 +6,8 @@
; Tests for add.
; CHECK: name: addi32
; CHECK: {{%[0-9]+}}(s32) = G_ADD
define i32 @addi32(i32 %arg1, i32 %arg2) {
define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
%res = add i32 %arg1, %arg2
ret i32 %res
store i32 %res, i32 addrspace(1)* undef
ret void
}


@ -0,0 +1,124 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Test that non-entry function frame indices are expanded properly to
; give an index relative to the scratch wave offset register
; Materialize into a mov. Make sure there isn't an unnecessary copy.
; GCN-LABEL: {{^}}func_mov_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_sub_u32 vcc_hi, s5, s4
; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_mov_fi_i32() #0 {
%alloca = alloca i32
store volatile i32* %alloca, i32* addrspace(3)* undef
ret void
}
; Materialize into an add of a constant offset from the FI.
; FIXME: Should be able to merge adds
; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN: s_sub_u32 s6, s5, s4
; GCN-NEXT: s_lshr_b32 s6, s6, 6
; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4
; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_add_constant_to_fi_i32() #0 {
%alloca = alloca [2 x i32], align 4
%gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1
store volatile i32* %gep0, i32* addrspace(3)* undef
ret void
}
; A user that the materialized frame index can't be meaningfully folded
; into.
; GCN-LABEL: {{^}}func_other_fi_user_i32:
; GCN: s_sub_u32 vcc_hi, s5, s4
; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @func_other_fi_user_i32() #0 {
%alloca = alloca [2 x i32], align 4
%ptrtoint = ptrtoint [2 x i32]* %alloca to i32
%mul = mul i32 %ptrtoint, 9
store volatile i32 %mul, i32 addrspace(3)* undef
ret void
}
; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
; GCN: v_mov_b32_e32 v1, 15{{$}}
; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}}
define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 {
store volatile i32 15, i32* %ptr
ret void
}
; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
; GCN: s_waitcnt
; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}}
define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
%val = load volatile i32, i32* %ptr
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
; GCN: s_waitcnt
; GCN-NEXT: s_sub_u32 s6, s5, s4
; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6
; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
; GCN-NOT: v_mov
; GCN: ds_write_b32 v0, v0
define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 {
%gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
%gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
%load1 = load i32, i32* %gep1
store volatile i32* %gep1, i32* addrspace(3)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5
; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 {
%gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
%gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
%load0 = load i8, i8* %gep0
%load1 = load i32, i32* %gep1
store volatile i8 %load0, i8 addrspace(3)* undef
store volatile i32 %load1, i32 addrspace(3)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
; GCN: s_sub_u32 s8, s5, s4
; GCN: v_lshr_b32_e64 v1, s8, 6
; GCN: s_and_saveexec_b64
; GCN: v_add_i32_e32 v0, vcc, 4, v1
; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4
; GCN: ds_write_b32
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 {
%cmp = icmp eq i32 %arg2, 0
br i1 %cmp, label %bb, label %ret
bb:
%gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
%gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
%load1 = load volatile i32, i32* %gep1
store volatile i32* %gep1, i32* addrspace(3)* undef
br label %ret
ret:
ret void
}
attributes #0 = { nounwind }
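
The sequence these checks pin down: a non-entry frame index becomes (FrameOffsetReg - ScratchWaveOffsetReg) >> 6 plus the object's offset within the frame. Reading the shift by 6 as division by the 64-lane wave size (an interpretation, not stated in the patch), the per-wave byte offset is converted into the per-lane value used for swizzled scratch addressing. Restated with s4/s5 as plain integers:

#include <cstdint>

// Model of the expansion checked in func_mov_fi_i32 above:
//   s_sub_u32  vcc_hi, s5, s4     -- frame offset minus scratch wave offset
//   s_lshr_b32 vcc_hi, vcc_hi, 6  -- divide by the 64-lane wave size
//   v_add_i32  v0, vcc, vcc_hi, 4 -- add the object's in-frame offset
uint32_t materializeFI(uint32_t s5, uint32_t s4, uint32_t ObjOffset) {
  uint32_t WaveBytes = s5 - s4;
  uint32_t PerLane   = WaveBytes >> 6;
  return PerLane + ObjOffset; // ObjOffset == 4 in func_mov_fi_i32
}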


@ -0,0 +1,734 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}void_func_i1:
; GCN: v_and_b32_e32 v0, 1, v0
; GCN: buffer_store_byte v0, off
define void @void_func_i1(i1 %arg0) #0 {
store i1 %arg0, i1 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i1_zeroext:
; GCN: s_waitcnt
; GCN-NEXT: v_or_b32_e32 v0, 12, v0
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
%ext = zext i1 %arg0 to i32
%add = add i32 %ext, 12
store i32 %add, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i1_signext:
; GCN: s_waitcnt
; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
define void @void_func_i1_signext(i1 signext %arg0) #0 {
%ext = sext i1 %arg0 to i32
%add = add i32 %ext, 12
store i32 %add, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i8:
; GCN-NOT: v0
; GCN: buffer_store_byte v0, off
define void @void_func_i8(i8 %arg0) #0 {
store i8 %arg0, i8 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i8_zeroext:
; GCN-NOT: and_b32
; GCN: v_add_i32_e32 v0, vcc, 12, v0
define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
%ext = zext i8 %arg0 to i32
%add = add i32 %ext, 12
store i32 %add, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i8_signext:
; GCN-NOT: v_bfe_i32
; GCN: v_add_i32_e32 v0, vcc, 12, v0
define void @void_func_i8_signext(i8 signext %arg0) #0 {
%ext = sext i8 %arg0 to i32
%add = add i32 %ext, 12
store i32 %add, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i16:
; GCN: buffer_store_short v0, off
define void @void_func_i16(i16 %arg0) #0 {
store i16 %arg0, i16 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i16_zeroext:
; GCN-NOT: v0
; GCN: v_add_i32_e32 v0, vcc, 12, v0
define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
%ext = zext i16 %arg0 to i32
%add = add i32 %ext, 12
store i32 %add, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i16_signext:
; GCN-NOT: v0
; GCN: v_add_i32_e32 v0, vcc, 12, v0
define void @void_func_i16_signext(i16 signext %arg0) #0 {
%ext = sext i16 %arg0 to i32
%add = add i32 %ext, 12
store i32 %add, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i32:
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
define void @void_func_i32(i32 %arg0) #0 {
store i32 %arg0, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_i64:
; GCN-NOT: v[0:1]
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: buffer_store_dwordx2 v[0:1], off
define void @void_func_i64(i64 %arg0) #0 {
store i64 %arg0, i64 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_f16:
; VI-NOT: v0
; CI: v_cvt_f16_f32_e32 v0, v0
; GCN: buffer_store_short v0, off
define void @void_func_f16(half %arg0) #0 {
store half %arg0, half addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_f32:
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
define void @void_func_f32(float %arg0) #0 {
store float %arg0, float addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_f64:
; GCN-NOT: v[0:1]
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: buffer_store_dwordx2 v[0:1], off
define void @void_func_f64(double %arg0) #0 {
store double %arg0, double addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v2i32:
; GCN-NOT: v[0:1]
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: buffer_store_dwordx2 v[0:1], off
define void @void_func_v2i32(<2 x i32> %arg0) #0 {
store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v3i32:
; GCN-DAG: buffer_store_dword v2, off
; GCN-DAG: buffer_store_dwordx2 v[0:1], off
define void @void_func_v3i32(<3 x i32> %arg0) #0 {
store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v4i32:
; GCN: buffer_store_dwordx4 v[0:3], off
define void @void_func_v4i32(<4 x i32> %arg0) #0 {
store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v5i32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dword v4, off
define void @void_func_v5i32(<5 x i32> %arg0) #0 {
store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v8i32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v8i32(<8 x i32> %arg0) #0 {
store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v16i32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
define void @void_func_v16i32(<16 x i32> %arg0) #0 {
store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
define void @void_func_v32i32(<32 x i32> %arg0) #0 {
store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
ret void
}
; 1 over register limit
; GCN-LABEL: {{^}}void_func_v33i32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
; GCN: buffer_store_dword [[STACKLOAD]], off
define void @void_func_v33i32(<33 x i32> %arg0) #0 {
store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v2i64:
; GCN: buffer_store_dwordx4 v[0:3], off
define void @void_func_v2i64(<2 x i64> %arg0) #0 {
store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v3i64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx2 v[4:5], off
define void @void_func_v3i64(<3 x i64> %arg0) #0 {
store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v4i64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v4i64(<4 x i64> %arg0) #0 {
store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v5i64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx2 v[8:9], off
define void @void_func_v5i64(<5 x i64> %arg0) #0 {
store <5 x i64> %arg0, <5 x i64> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v8i64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
define void @void_func_v8i64(<8 x i64> %arg0) #0 {
store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v16i64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
define void @void_func_v16i64(<16 x i64> %arg0) #0 {
store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v2i16:
; GFX9-NOT: v0
; GFX9: buffer_store_dword v0, off
define void @void_func_v2i16(<2 x i16> %arg0) #0 {
store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v3i16:
; GCN-DAG: buffer_store_dword v0, off
; GCN-DAG: buffer_store_short v2, off
define void @void_func_v3i16(<3 x i16> %arg0) #0 {
store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v4i16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9: buffer_store_dwordx2 v[0:1], off
define void @void_func_v4i16(<4 x i16> %arg0) #0 {
store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v5i16:
; GCN-DAG: buffer_store_short v4, off,
; GCN-DAG: buffer_store_dwordx2 v[1:2], off
define void @void_func_v5i16(<5 x i16> %arg0) #0 {
store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v8i16:
; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
define void @void_func_v8i16(<8 x i16> %arg0) #0 {
store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v16i16:
; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v16i16(<16 x i16> %arg0) #0 {
store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v2f32:
; GCN-NOT: v[0:1]
; GCN-NOT: v0
; GCN-NOT: v1
; GCN: buffer_store_dwordx2 v[0:1], off
define void @void_func_v2f32(<2 x float> %arg0) #0 {
store <2 x float> %arg0, <2 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v3f32:
; GCN-DAG: buffer_store_dword v2, off
; GCN-DAG: buffer_store_dwordx2 v[0:1], off
define void @void_func_v3f32(<3 x float> %arg0) #0 {
store <3 x float> %arg0, <3 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v4f32:
; GCN: buffer_store_dwordx4 v[0:3], off
define void @void_func_v4f32(<4 x float> %arg0) #0 {
store <4 x float> %arg0, <4 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v8f32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v8f32(<8 x float> %arg0) #0 {
store <8 x float> %arg0, <8 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v16f32:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
define void @void_func_v16f32(<16 x float> %arg0) #0 {
store <16 x float> %arg0, <16 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v2f64:
; GCN: buffer_store_dwordx4 v[0:3], off
define void @void_func_v2f64(<2 x double> %arg0) #0 {
store <2 x double> %arg0, <2 x double> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v3f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx2 v[4:5], off
define void @void_func_v3f64(<3 x double> %arg0) #0 {
store <3 x double> %arg0, <3 x double> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v4f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v4f64(<4 x double> %arg0) #0 {
store <4 x double> %arg0, <4 x double> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v8f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
define void @void_func_v8f64(<8 x double> %arg0) #0 {
store <8 x double> %arg0, <8 x double> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v16f64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
define void @void_func_v16f64(<16 x double> %arg0) #0 {
store <16 x double> %arg0, <16 x double> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v2f16:
; GFX9-NOT: v0
; GFX9: buffer_store_dword v0, off
define void @void_func_v2f16(<2 x half> %arg0) #0 {
store <2 x half> %arg0, <2 x half> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v3f16:
; GFX9-NOT: v0
; GCN-DAG: buffer_store_dword v0, off
; GCN-DAG: buffer_store_short v2, off
define void @void_func_v3f16(<3 x half> %arg0) #0 {
store <3 x half> %arg0, <3 x half> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v4f16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9-NOT: v[0:1]
; GFX9: buffer_store_dwordx2 v[0:1], off
define void @void_func_v4f16(<4 x half> %arg0) #0 {
store <4 x half> %arg0, <4 x half> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v8f16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9: buffer_store_dwordx4 v[0:3], off
define void @void_func_v8f16(<8 x half> %arg0) #0 {
store <8 x half> %arg0, <8 x half> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v16f16:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
define void @void_func_v16f16(<16 x half> %arg0) #0 {
store <16 x half> %arg0, <16 x half> addrspace(1)* undef
ret void
}
; Make sure there is no alignment requirement for passed vgprs.
; GCN-LABEL: {{^}}void_func_i32_i64_i32:
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
; GCN: buffer_store_dwordx2 v[1:2]
; GCN: buffer_store_dword v3
define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
store volatile i32 %arg0, i32 addrspace(1)* undef
store volatile i64 %arg1, i64 addrspace(1)* undef
store volatile i32 %arg2, i32 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_struct_i32:
; GCN-NOT: v0
; GCN: buffer_store_dword v0, off
define void @void_func_struct_i32({ i32 } %arg0) #0 {
store { i32 } %arg0, { i32 } addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_struct_i8_i32:
; GCN-DAG: buffer_store_byte v0, off
; GCN-DAG: buffer_store_dword v1, off
define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32:
; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_store_dword v[[ELT1]]
; GCN-DAG: buffer_store_byte v[[ELT0]]
define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 {
%arg0.load = load { i8, i32 }, { i8, i32 }* %arg0
store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN: ds_write_b32 v0, v0
; GCN: s_setpc_b64
define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 {
%arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0
%arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1
store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef
store volatile i32 %arg2, i32 addrspace(3)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64:
; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off
define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 {
%arg0.load = load i32, i32* %arg0
%arg1.load = load i64, i64* %arg1
store i32 %arg0.load, i32 addrspace(1)* undef
store i64 %arg1.load, i64 addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32_i32_i64:
; GCN-DAG: buffer_store_dwordx4 v[0:3], off
; GCN-DAG: buffer_store_dwordx4 v[4:7], off
; GCN-DAG: buffer_store_dwordx4 v[8:11], off
; GCN-DAG: buffer_store_dwordx4 v[12:15], off
; GCN-DAG: buffer_store_dwordx4 v[16:19], off
; GCN-DAG: buffer_store_dwordx4 v[20:23], off
; GCN-DAG: buffer_store_dwordx4 v[24:27], off
; GCN-DAG: buffer_store_dwordx4 v[28:31], off
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:4
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:8
; GCN: buffer_store_dword v[[LOAD_ARG1]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile i32 %arg1, i32 addrspace(1)* undef
store volatile i64 %arg2, i64 addrspace(1)* undef
ret void
}
; FIXME: Different ext load types on CI vs. VI
; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16:
; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]]
; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]]
; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off
; GCN: buffer_store_byte [[LOAD_ARG2]], off
; GCN: buffer_store_short [[LOAD_ARG3]], off
; VI: buffer_store_short [[LOAD_ARG4]], off
; CI: buffer_store_short [[CVT_ARG4]], off
define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile i1 %arg1, i1 addrspace(1)* undef
store volatile i8 %arg2, i8 addrspace(1)* undef
store volatile i16 %arg3, i16 addrspace(1)* undef
store volatile half %arg4, half addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off
; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef
store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16:
; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GFX9: buffer_store_dword [[LOAD_ARG1]], off
; GFX9: buffer_store_dword [[LOAD_ARG2]], off
define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef
store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef
store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef
store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:60{{$}}
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off
; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef
store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef
ret void
}
; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32:
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s5 offset:60{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:64{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:68{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:72{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:76{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:80{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:84{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:88{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:92{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:96{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:100{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:104{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:108{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:112{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:116{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:120{{$}}
; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:124{{$}}
define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef
ret void
}
; Check there is no crash.
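; <16 x i8> has no direct register mapping here, so it presumably gets
; legalized by splitting/promoting the elements; this only verifies that
; argument lowering succeeds.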
; GCN-LABEL: {{^}}void_func_v16i8:
define void @void_func_v16i8(<16 x i8> %arg0) #0 {
store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef
ret void
}
; Check there is no crash.
; GCN-LABEL: {{^}}void_func_v32i32_v16i8:
define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef
ret void
}
attributes #0 = { nounwind }

View File

@ -0,0 +1,514 @@
; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}i1_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define i1 @i1_func_void() #0 {
%val = load i1, i1 addrspace(1)* undef
ret i1 %val
}
; FIXME: Missing and?
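; A zeroext i1 return should presumably isolate bit 0, e.g. with
; something like "v_and_b32_e32 v0, 1, v0" before returning; it appears
; to be omitted on the assumption that the loaded byte is already 0 or 1.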
; GCN-LABEL: {{^}}i1_zeroext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i1 @i1_zeroext_func_void() #0 {
%val = load i1, i1 addrspace(1)* undef
ret i1 %val
}
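; For a signext i1 return, bit 0 must be replicated across the register;
; the v_bfe_i32 below extracts a 1-bit signed field starting at bit 0,
; producing 0 or -1.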
; GCN-LABEL: {{^}}i1_signext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}}
; GCN-NEXT: s_setpc_b64
define signext i1 @i1_signext_func_void() #0 {
%val = load i1, i1 addrspace(1)* undef
ret i1 %val
}
; GCN-LABEL: {{^}}i8_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i8 @i8_func_void() #0 {
%val = load i8, i8 addrspace(1)* undef
ret i8 %val
}
; GCN-LABEL: {{^}}i8_zeroext_func_void:
; GCN: buffer_load_ubyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i8 @i8_zeroext_func_void() #0 {
%val = load i8, i8 addrspace(1)* undef
ret i8 %val
}
; GCN-LABEL: {{^}}i8_signext_func_void:
; GCN: buffer_load_sbyte v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define signext i8 @i8_signext_func_void() #0 {
%val = load i8, i8 addrspace(1)* undef
ret i8 %val
}
; GCN-LABEL: {{^}}i16_func_void:
; GCN: buffer_load_ushort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i16 @i16_func_void() #0 {
%val = load i16, i16 addrspace(1)* undef
ret i16 %val
}
; GCN-LABEL: {{^}}i16_zeroext_func_void:
; GCN: buffer_load_ushort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define zeroext i16 @i16_zeroext_func_void() #0 {
%val = load i16, i16 addrspace(1)* undef
ret i16 %val
}
; GCN-LABEL: {{^}}i16_signext_func_void:
; GCN: buffer_load_sshort v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define signext i16 @i16_signext_func_void() #0 {
%val = load i16, i16 addrspace(1)* undef
ret i16 %val
}
; GCN-LABEL: {{^}}i32_func_void:
; GCN: buffer_load_dword v0, off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i32 @i32_func_void() #0 {
%val = load i32, i32 addrspace(1)* undef
ret i32 %val
}
; GCN-LABEL: {{^}}i64_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define i64 @i64_func_void() #0 {
%val = load i64, i64 addrspace(1)* undef
ret i64 %val
}
; GCN-LABEL: {{^}}f32_func_void:
; GCN: buffer_load_dword v0, off, s[8:11], 0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define float @f32_func_void() #0 {
%val = load float, float addrspace(1)* undef
ret float %val
}
; GCN-LABEL: {{^}}f64_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define double @f64_func_void() #0 {
%val = load double, double addrspace(1)* undef
ret double %val
}
; GCN-LABEL: {{^}}v2i32_func_void:
; GCN: buffer_load_dwordx2 v[0:1], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <2 x i32> @v2i32_func_void() #0 {
%val = load <2 x i32>, <2 x i32> addrspace(1)* undef
ret <2 x i32> %val
}
; GCN-LABEL: {{^}}v3i32_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i32> @v3i32_func_void() #0 {
%val = load <3 x i32>, <3 x i32> addrspace(1)* undef
ret <3 x i32> %val
}
; GCN-LABEL: {{^}}v4i32_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <4 x i32> @v4i32_func_void() #0 {
%val = load <4 x i32>, <4 x i32> addrspace(1)* undef
ret <4 x i32> %val
}
; GCN-LABEL: {{^}}v5i32_func_void:
; GCN-DAG: buffer_load_dword v4, off
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i32> @v5i32_func_void() #0 {
%val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef
ret <5 x i32> %val
}
; GCN-LABEL: {{^}}v8i32_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <8 x i32> @v8i32_func_void() #0 {
%ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
%val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
ret <8 x i32> %val
}
; GCN-LABEL: {{^}}v16i32_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <16 x i32> @v16i32_func_void() #0 {
%ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
%val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
ret <16 x i32> %val
}
; GCN-LABEL: {{^}}v32i32_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <32 x i32> @v32i32_func_void() #0 {
%ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
%val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
ret <32 x i32> %val
}
; GCN-LABEL: {{^}}v2i64_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <2 x i64> @v2i64_func_void() #0 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* undef
ret <2 x i64> %val
}
; GCN-LABEL: {{^}}v3i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <3 x i64> @v3i64_func_void() #0 {
%ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef
%val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
ret <3 x i64> %val
}
; GCN-LABEL: {{^}}v4i64_func_void:
; GCN: buffer_load_dwordx4 v[0:3], off
; GCN: buffer_load_dwordx4 v[4:7], off
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <4 x i64> @v4i64_func_void() #0 {
%ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef
%val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
ret <4 x i64> %val
}
; GCN-LABEL: {{^}}v5i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <5 x i64> @v5i64_func_void() #0 {
%ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef
%val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
ret <5 x i64> %val
}
; GCN-LABEL: {{^}}v8i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <8 x i64> @v8i64_func_void() #0 {
%ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef
%val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
ret <8 x i64> %val
}
; GCN-LABEL: {{^}}v16i64_func_void:
; GCN-DAG: buffer_load_dwordx4 v[0:3], off
; GCN-DAG: buffer_load_dwordx4 v[4:7], off
; GCN-DAG: buffer_load_dwordx4 v[8:11], off
; GCN-DAG: buffer_load_dwordx4 v[12:15], off
; GCN-DAG: buffer_load_dwordx4 v[16:19], off
; GCN-DAG: buffer_load_dwordx4 v[20:23], off
; GCN-DAG: buffer_load_dwordx4 v[24:27], off
; GCN-DAG: buffer_load_dwordx4 v[28:31], off
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <16 x i64> @v16i64_func_void() #0 {
%ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef
%val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
ret <16 x i64> %val
}
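; GFX9 has packed 16-bit register operations, so <2 x i16> presumably
; returns in a single VGPR and is reloaded with one dword load.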
; GCN-LABEL: {{^}}v2i16_func_void:
; GFX9: buffer_load_dword v0, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <2 x i16> @v2i16_func_void() #0 {
%val = load <2 x i16>, <2 x i16> addrspace(1)* undef
ret <2 x i16> %val
}
; GCN-LABEL: {{^}}v3i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v3i16_func_void() #0 {
%val = load <3 x i16>, <3 x i16> addrspace(1)* undef
ret <3 x i16> %val
}
; GCN-LABEL: {{^}}v4i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v4i16_func_void() #0 {
%val = load <4 x i16>, <4 x i16> addrspace(1)* undef
ret <4 x i16> %val
}
; FIXME: Should not scalarize
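; <5 x i16> is currently split into scalar pieces (note the separate
; ushort load and the 16-bit unpacking shifts below); on GFX9 it could
; presumably be returned as three packed dwords instead.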
; GCN-LABEL: {{^}}v5i16_func_void:
; GFX9: buffer_load_dwordx2 v[0:1]
; GFX9: buffer_load_ushort v4
; GFX9: v_lshrrev_b32_e32 v3, 16, v1
; GFX9: v_mov_b32_e32 v2, v1
; GFX9: v_lshrrev_b32_e32 v3, 16, v0
; GCN: s_setpc_b64
define <5 x i16> @v5i16_func_void() #0 {
%ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef
%val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
ret <5 x i16> %val
}
; GCN-LABEL: {{^}}v8i16_func_void:
; GFX9-DAG: buffer_load_dwordx4 v[0:3], off
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <8 x i16> @v8i16_func_void() #0 {
%ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef
%val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
ret <8 x i16> %val
}
; GCN-LABEL: {{^}}v16i16_func_void:
; GFX9: buffer_load_dwordx4 v[0:3], off
; GFX9: buffer_load_dwordx4 v[4:7], off
; GFX9: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64
define <16 x i16> @v16i16_func_void() #0 {
%ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef
%val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
ret <16 x i16> %val
}
; FIXME: Should pack
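; The 16 bytes are currently returned one per VGPR (v0-v15) rather than
; packed four to a dword.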
; GCN-LABEL: {{^}}v16i8_func_void:
; GCN-DAG: v12
; GCN-DAG: v13
; GCN-DAG: v14
; GCN-DAG: v15
define <16 x i8> @v16i8_func_void() #0 {
%ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
%val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
ret <16 x i8> %val
}
; FIXME: Should pack
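; The four bytes arrive packed in a single dword and are unpacked into
; v0-v3: bits 16 and 24 via v_lshrrev_b32, and byte 1 via v_bfe_u32 on
; CI or a 16-bit shift on VI. Returning them packed would presumably
; avoid the unpacking.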
; GCN-LABEL: {{^}}v4i8_func_void:
; GCN: buffer_load_dword v0
; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0
; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
; CI-DAG: v_bfe_u32 v1, v0, 8, 8
; VI-DAG: v_lshrrev_b16_e32 v1, 8, v0
; GCN: s_setpc_b64
define <4 x i8> @v4i8_func_void() #0 {
%ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef
%val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
ret <4 x i8> %val
}
; GCN-LABEL: {{^}}struct_i8_i32_func_void:
; GCN-DAG: buffer_load_dword v1
; GCN-DAG: buffer_load_ubyte v0
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define {i8, i32} @struct_i8_i32_func_void() #0 {
%val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef
ret { i8, i32 } %val
}
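; With sret, the caller presumably passes the result pointer as the
; first VGPR argument, so the callee stores through v0 with offen
; addressing: the i8 field at offset 0 and the i32 field at offset 4.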
; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32:
; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]]
; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}}
; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}}
define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 {
%val0 = load volatile i8, i8 addrspace(1)* undef
%val1 = load volatile i32, i32 addrspace(1)* undef
%gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
%gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
store i8 %val0, i8* %gep0
store i32 %val1, i32* %gep1
ret void
}
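; <33 x i32> does not fit in the 32 VGPRs available for a return value,
; so it is presumably returned indirectly through a hidden pointer in
; v0, as the offen stores covering offsets 0-128 below suggest.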
; GCN-LABEL: {{^}}v33i32_func_void:
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <33 x i32> @v33i32_func_void() #0 {
%ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef
%val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
ret <33 x i32> %val
}
; GCN-LABEL: {{^}}struct_v32i32_i32_func_void:
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
%ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef
%val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
ret { <32 x i32>, i32 } %val
}
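; In { i32, <32 x i32> } the vector field starts at byte offset 128,
; presumably because the <32 x i32> member is aligned to its 128-byte
; size in memory.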
; GCN-LABEL: {{^}}struct_i32_v32i32_func_void:
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}}
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
%ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef
%val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
ret { i32, <32 x i32> } %val
}
attributes #0 = { nounwind }

View File

@ -27,7 +27,7 @@
; ELF: Symbol {
; ELF: Name: simple
; ELF: Size: 44
; ELF: Size: 48
; ELF: Type: Function (0x2)
; ELF: }
@ -41,14 +41,12 @@
; HSA: .p2align 2
; HSA: {{^}}simple:
; HSA-NOT: amd_kernel_code_t
; FIXME: Check this isn't a kernarg load once the calling convention is implemented.
; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
; HSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
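; For a plain function the pointer argument should arrive in a VGPR
; rather than via a kernarg-segment load from s[4:5], which is why the
; s_load_dwordx2 is checked to be absent.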
; Make sure we are setting the ATC bit:
; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
; HSA-CI: s_mov_b32 s[[HI:[0-9]+]], 0x100f000
; On VI+ we also need to set MTYPE = 2
; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
; HSA-VI: s_mov_b32 s[[HI:[0-9]+]], 0x1100f000
; Make sure we generate flat store for HSA
; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
@ -56,8 +54,9 @@
; HSA: .size simple, .Lfunc_end0-simple
; HSA: ; Function info:
; HSA-NOT: COMPUTE_PGM_RSRC2
define void @simple(i32 addrspace(1)* %out) {
define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) {
entry:
%out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
store i32 0, i32 addrspace(1)* %out
ret void
}

View File

@ -191,7 +191,7 @@ entry:
; CHECK: v_mov_b32_e32 v0, s0
; CHECK: v_mov_b32_e32 v1, s1
; CHECK: use v[0:1]
define void @i64_imm_input_phys_vgpr() {
define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
entry:
call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456)
ret void

View File

@ -1,4 +1,12 @@
# RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
--- |
define amdgpu_kernel void @func0() {
ret void
}
...
---
# We should not detect any interference between v0/v1 here and only allocate
# sgpr0-sgpr3.