diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 5b18aefbd78..152f8ecdf29 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1128,6 +1128,13 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
 void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
                                         const SIProgramInfo &CurrentProgramInfo,
                                         const MachineFunction &MF) const {
+  const Function &F = MF.getFunction();
+
+  // Avoid asserting on erroneous cases.
+  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
+      F.getCallingConv() != CallingConv::SPIR_KERNEL)
+    return;
+
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
 
@@ -1174,9 +1181,8 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
   if (STM.isXNACKEnabled())
     Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
-  // FIXME: Should use getKernArgSize
-  Out.kernarg_segment_byte_size =
-    STM.getKernArgSegmentSize(MF.getFunction(), MFI->getExplicitKernArgSize());
+  unsigned MaxKernArgAlign;
+  Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
   Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
   Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
   Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
@@ -1185,7 +1191,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
   // These alignment values are specified in powers of two, so alignment =
   // 2^n. The minimum alignment is 2^4 = 16.
   Out.kernarg_segment_alignment = std::max((size_t)4,
-      countTrailingZeros(MFI->getMaxKernArgAlign()));
+      countTrailingZeros(MaxKernArgAlign));
 
   if (STM.debuggerEmitPrologue()) {
     Out.debug_wavefront_private_segment_offset_sgpr =
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b33079ae4ba..29e93a9d9d3 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -209,15 +209,16 @@ Kernel::CodeProps::Metadata MetadataStreamer::getHSACodeProps(
   const Function &F = MF.getFunction();
 
   // Avoid asserting on erroneous cases.
-  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
+      F.getCallingConv() != CallingConv::SPIR_KERNEL)
     return HSACodeProps;
 
-  HSACodeProps.mKernargSegmentSize =
-      STM.getKernArgSegmentSize(F, MFI.getExplicitKernArgSize());
+  unsigned MaxKernArgAlign;
+  HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
+                                                               MaxKernArgAlign);
   HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
   HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
-  HSACodeProps.mKernargSegmentAlign =
-      std::max(uint32_t(4), MFI.getMaxKernArgAlign());
+  HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
   HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
   HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
   HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index acdedab7e13..583a09e34ab 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -30,6 +30,7 @@
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -40,18 +41,6 @@
 #include "llvm/Support/KnownBits.h"
 using namespace llvm;
 
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                            CCValAssign::LocInfo LocInfo,
-                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  MachineFunction &MF = State.getMachineFunction();
-  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
-  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
-                                         ArgFlags.getOrigAlign());
-  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  return true;
-}
-
 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
                            CCValAssign::LocInfo LocInfo,
                            ISD::ArgFlagsTy ArgFlags, CCState &State,
@@ -910,74 +899,118 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
 /// for each individual part is i8. We pass the memory type as LocVT to the
 /// calling convention analysis function and the register type (Ins[x].VT) as
 /// the ValVT.
-void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
-                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
-  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-    const ISD::InputArg &In = Ins[i];
-    EVT MemVT;
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
+  CCState &State,
+  const SmallVectorImpl<ISD::InputArg> &Ins) const {
+  const MachineFunction &MF = State.getMachineFunction();
+  const Function &Fn = MF.getFunction();
+  LLVMContext &Ctx = Fn.getParent()->getContext();
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+  const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
 
-    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
+  unsigned MaxAlign = 1;
+  uint64_t ExplicitArgOffset = 0;
+  const DataLayout &DL = Fn.getParent()->getDataLayout();
 
-    if (!Subtarget->isAmdHsaOS() &&
-        (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
-      // The ABI says the caller will extend these values to 32-bits.
-      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
-    } else if (NumRegs == 1) {
-      // This argument is not split, so the IR type is the memory type.
-      assert(!In.Flags.isSplit());
-      if (In.ArgVT.isExtended()) {
-        // We have an extended type, like i24, so we should just use the register type
-        MemVT = In.VT;
+  unsigned InIndex = 0;
+
+  for (const Argument &Arg : Fn.args()) {
+    Type *BaseArgTy = Arg.getType();
+    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
+    MaxAlign = std::max(Align, MaxAlign);
+    unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+
+    // We're basically throwing away everything passed into us and starting over
+    // to get accurate in-memory offsets. The "PartOffset" is completely useless
+    // to us as computed in Ins.
+    //
+    // We also need to figure out what type legalization is trying to do to get
+    // the correct memory offsets.
+
+    SmallVector<EVT, 16> ValueVTs;
+    SmallVector<uint64_t, 16> Offsets;
+    ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
+
+    for (unsigned Value = 0, NumValues = ValueVTs.size();
+         Value != NumValues; ++Value) {
+      uint64_t BasePartOffset = Offsets[Value];
+
+      EVT ArgVT = ValueVTs[Value];
+      EVT MemVT = ArgVT;
+      MVT RegisterVT =
+        getRegisterTypeForCallingConv(Ctx, ArgVT);
+      unsigned NumRegs =
+        getNumRegistersForCallingConv(Ctx, ArgVT);
+
+      if (!Subtarget->isAmdHsaOS() &&
+          (ArgVT == MVT::i16 || ArgVT == MVT::i8 || ArgVT == MVT::f16)) {
+        // The ABI says the caller will extend these values to 32-bits.
+        MemVT = ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+      } else if (NumRegs == 1) {
+        // This argument is not split, so the IR type is the memory type.
+        if (ArgVT.isExtended()) {
+          // We have an extended type, like i24, so we should just use the
+          // register type.
+          MemVT = RegisterVT;
+        } else {
+          MemVT = ArgVT;
+        }
+      } else if (ArgVT.isVector() && RegisterVT.isVector() &&
+                 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
+        assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
+        // We have a vector value which has been split into a vector with
+        // the same scalar type, but fewer elements. This should handle
+        // all the floating-point vector types.
+        MemVT = RegisterVT;
+      } else if (ArgVT.isVector() &&
+                 ArgVT.getVectorNumElements() == NumRegs) {
+        // This arg has been split so that each element is stored in a separate
+        // register.
+        MemVT = ArgVT.getScalarType();
+      } else if (ArgVT.isExtended()) {
+        // We have an extended type, like i65.
+        MemVT = RegisterVT;
       } else {
-        MemVT = In.ArgVT;
+        unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
+        assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
+        if (RegisterVT.isInteger()) {
+          MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+        } else if (RegisterVT.isVector()) {
+          assert(!RegisterVT.getScalarType().isFloatingPoint());
+          unsigned NumElements = RegisterVT.getVectorNumElements();
+          assert(MemoryBits % NumElements == 0);
+          // This vector type has been split into another vector type with
+          // a different elements size.
+          EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+                                           MemoryBits / NumElements);
+          MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+        } else {
+          llvm_unreachable("cannot deduce memory type.");
+        }
       }
-    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
-               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
-      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
-      // We have a vector value which has been split into a vector with
-      // the same scalar type, but fewer elements. This should handle
-      // all the floating-point vector types.
-      MemVT = In.VT;
-    } else if (In.ArgVT.isVector() &&
-               In.ArgVT.getVectorNumElements() == NumRegs) {
-      // This arg has been split so that each element is stored in a separate
-      // register.
-      MemVT = In.ArgVT.getScalarType();
-    } else if (In.ArgVT.isExtended()) {
-      // We have an extended type, like i65.
-      MemVT = In.VT;
-    } else {
-      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
-      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
-      if (In.VT.isInteger()) {
-        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
-      } else if (In.VT.isVector()) {
-        assert(!In.VT.getScalarType().isFloatingPoint());
-        unsigned NumElements = In.VT.getVectorNumElements();
-        assert(MemoryBits % NumElements == 0);
-        // This vector type has been split into another vector type with
-        // a different elements size.
-        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
-                                         MemoryBits / NumElements);
-        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
-      } else {
-        llvm_unreachable("cannot deduce memory type.");
+
+      // Convert one element vectors to scalar.
+      if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+        MemVT = MemVT.getScalarType();
+
+      if (MemVT.isExtended()) {
+        // This should really only happen if we have vec3 arguments
+        assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+        MemVT = MemVT.getPow2VectorType(State.getContext());
+      }
+
+      unsigned PartOffset = 0;
+      for (unsigned i = 0; i != NumRegs; ++i) {
+        State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
+                                               BasePartOffset + PartOffset,
+                                               MemVT.getSimpleVT(),
+                                               CCValAssign::Full));
+        PartOffset += MemVT.getStoreSize();
       }
     }
-
-    // Convert one element vectors to scalar.
-    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
-      MemVT = MemVT.getScalarType();
-
-    if (MemVT.isExtended()) {
-      // This should really only happen if we have vec3 arguments
-      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
-      MemVT = MemVT.getPow2VectorType(State.getContext());
-    }
-
-    assert(MemVT.isSimple());
-    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
-                    State);
   }
 }
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 1e027dd6712..096e40230c6 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -122,8 +122,11 @@ protected:
   SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &Results) const;
-  void analyzeFormalArgumentsCompute(CCState &State,
-                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
+  void analyzeFormalArgumentsCompute(
+    CCState &State,
+    const SmallVectorImpl<ISD::InputArg> &Ins) const;
+
 public:
   AMDGPUTargetLowering(const TargetMachine &TM,
                        const AMDGPUSubtarget &STI);
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 3c5760804b3..8cc7e38f7b2 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -77,8 +77,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
   const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
   const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
 
+  unsigned MaxAlign;
   // FIXME: Alignment is broken broken with explicit arg offset.;
-  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F);
+  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
   if (TotalKernArgSize == 0)
     return false;
 
@@ -91,13 +92,11 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
     Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
 
   unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
-  unsigned MaxAlign = 1;
   uint64_t ExplicitArgOffset = 0;
 
   for (Argument &Arg : F.args()) {
     Type *ArgTy = Arg.getType();
    unsigned Align = DL.getABITypeAlignment(ArgTy);
-    MaxAlign = std::max(Align, MaxAlign);
     unsigned Size = DL.getTypeSizeInBits(ArgTy);
     unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
 
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0574c991ee6..13b4b50149c 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -24,16 +24,23 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
   MemoryBound(false),
   WaveLimiter(false) {
+  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
+
   // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
   // except reserved size is not correctly aligned.
+  const Function &F = MF.getFunction();
 
   if (auto *Resolver = MF.getMMI().getResolver()) {
     if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis*>(
           Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
-      MemoryBound = PHA->isMemoryBound(&MF.getFunction());
-      WaveLimiter = PHA->needsWaveLimiter(&MF.getFunction());
+      MemoryBound = PHA->isMemoryBound(&F);
+      WaveLimiter = PHA->needsWaveLimiter(&F);
     }
   }
+
+  CallingConv::ID CC = F.getCallingConv();
+  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
+    ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 2c4bf328008..8d6b871bc03 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -23,8 +23,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
 
 protected:
-  uint64_t ExplicitKernArgSize;
-  unsigned MaxKernArgAlign;
+  uint64_t ExplicitKernArgSize; // Cache for this.
+  unsigned MaxKernArgAlign; // Cache for this.
 
   /// Number of bytes in the LDS that are being used.
   unsigned LDSSize;
 
@@ -44,17 +44,6 @@ protected:
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
 
-  uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
-    assert(isPowerOf2_32(Align));
-    ExplicitKernArgSize = alignTo(ExplicitKernArgSize, Align);
-
-    uint64_t Result = ExplicitKernArgSize;
-    ExplicitKernArgSize += Size;
-
-    MaxKernArgAlign = std::max(Align, MaxKernArgAlign);
-    return Result;
-  }
-
   uint64_t getExplicitKernArgSize() const {
     return ExplicitKernArgSize;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 3efc564c855..98b49070fa9 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -209,7 +209,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FeatureDisable(false),
     InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
-    TLInfo(TM, *this), 
+    TLInfo(TM, *this),
     FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
   AS = AMDGPU::getAMDGPUAS(TT);
   CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
@@ -406,6 +406,44 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
   return true;
 }
 
+uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
+                                                 unsigned &MaxAlign) const {
+  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+         F.getCallingConv() == CallingConv::SPIR_KERNEL);
+
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  uint64_t ExplicitArgBytes = 0;
+  MaxAlign = 1;
+
+  for (const Argument &Arg : F.args()) {
+    Type *ArgTy = Arg.getType();
+
+    unsigned Align = DL.getABITypeAlignment(ArgTy);
+    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
+    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
+    MaxAlign = std::max(MaxAlign, Align);
+  }
+
+  return ExplicitArgBytes;
+}
+
+unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
+                                                unsigned &MaxAlign) const {
+  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
+
+  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
+
+  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
+  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
+  if (ImplicitBytes != 0) {
+    unsigned Alignment = getAlignmentForImplicitArgPtr();
+    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
+  }
+
+  // Being able to dereference past the end is useful for emitting scalar loads.
+  return alignTo(TotalSize, 4);
+}
+
 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                              const TargetMachine &TM) :
   R600GenSubtargetInfo(TT, GPU, FS),
@@ -446,40 +484,6 @@ bool GCNSubtarget::isVGPRSpillingEnabled(const Function& F) const {
   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 }
 
-uint64_t GCNSubtarget::getExplicitKernArgSize(const Function &F) const {
-  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL);
-
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  uint64_t ExplicitArgBytes = 0;
-  for (const Argument &Arg : F.args()) {
-    Type *ArgTy = Arg.getType();
-
-    unsigned Align = DL.getABITypeAlignment(ArgTy);
-    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
-    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
-  }
-
-  return ExplicitArgBytes;
-}
-
-unsigned GCNSubtarget::getKernArgSegmentSize(const Function &F,
-                                             int64_t ExplicitArgBytes) const {
-  if (ExplicitArgBytes == -1)
-    ExplicitArgBytes = getExplicitKernArgSize(F);
-
-  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
-
-  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
-  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
-  if (ImplicitBytes != 0) {
-    unsigned Alignment = getAlignmentForImplicitArgPtr();
-    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
-  }
-
-  // Being able to dereference past the end is useful for emitting scalar loads.
-  return alignTo(TotalSize, 4);
-}
-
 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
   if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
     if (SGPRs <= 80)
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index d9806d6133c..62310973365 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -51,7 +51,7 @@ public:
   enum Generation {
     R600 = 0,
     R700 = 1,
-    EVERGREEN = 2, 
+    EVERGREEN = 2,
     NORTHERN_ISLANDS = 3,
     SOUTHERN_ISLANDS = 4,
     SEA_ISLANDS = 5,
@@ -82,7 +82,7 @@ public:
 
   static const AMDGPUSubtarget &get(const MachineFunction &MF);
   static const AMDGPUSubtarget &get(const TargetMachine &TM,
-                                    const Function &F); 
+                                    const Function &F);
 
   /// \returns Default range flat work group size for a calling convention.
   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
@@ -231,6 +231,18 @@ public:
   /// Creates value range metadata on an workitemid.* inrinsic call or load.
   bool makeLIDRangeMetadata(Instruction *I) const;
 
+  /// \returns Number of bytes of arguments that are passed to a shader or
+  /// kernel in addition to the explicit ones declared for the function.
+  unsigned getImplicitArgNumBytes(const Function &F) const {
+    if (isMesaKernel(F))
+      return 16;
+    return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
+  }
+  uint64_t getExplicitKernArgSize(const Function &F,
+                                  unsigned &MaxAlign) const;
+  unsigned getKernArgSegmentSize(const Function &F,
+                                 unsigned &MaxAlign) const;
+
   virtual ~AMDGPUSubtarget() {}
 };
 
@@ -669,14 +681,6 @@ public:
     return D16PreservesUnusedBits;
   }
 
-  /// \returns Number of bytes of arguments that are passed to a shader or
-  /// kernel in addition to the explicit ones declared for the function.
-  unsigned getImplicitArgNumBytes(const Function &F) const {
-    if (isMesaKernel(F))
-      return 16;
-    return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
-  }
-
   // Scratch is allocated in 256 dword per wave blocks for the entire
   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
   // is 4-byte aligned.
@@ -825,10 +829,6 @@ public:
     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
   }
 
-  uint64_t getExplicitKernArgSize(const Function &F) const;
-  unsigned getKernArgSegmentSize(const Function &F,
-                                 int64_t ExplicitArgBytes = -1) const;
-
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/lib/Target/AMDGPU/R600.td b/lib/Target/AMDGPU/R600.td
index ff96928211c..5c9c1c1ed50 100644
--- a/lib/Target/AMDGPU/R600.td
+++ b/lib/Target/AMDGPU/R600.td
@@ -52,8 +52,3 @@ def CC_R600 : CallingConv<[
     T30_XYZW, T31_XYZW, T32_XYZW
   ]>>>
 ]>;
-
-// Calling convention for compute kernels
-def CC_R600_Kernel : CallingConv<[
-  CCCustom<"allocateKernArg">
-]>;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index 4110e6a28d6..113d6249fa6 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -50,18 +50,6 @@
 
 using namespace llvm;
 
-static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                            CCValAssign::LocInfo LocInfo,
-                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  MachineFunction &MF = State.getMachineFunction();
-  AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
-
-  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
-                                         ArgFlags.getOrigAlign());
-  State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-  return true;
-}
-
 #include "R600GenCallingConv.inc"
 
 R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
@@ -234,7 +222,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FMA, MVT::f32, Expand);
     setOperationAction(ISD::FMA, MVT::f64, Expand);
   }
-  
+
   // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
   // need it for R600.
   if (!Subtarget->hasFP32Denormals())
@@ -1583,7 +1571,7 @@ CCAssignFn *R600TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:
-    return CC_R600_Kernel;
+    llvm_unreachable("kernels should not be handled here");
   case CallingConv::AMDGPU_VS:
   case CallingConv::AMDGPU_GS:
   case CallingConv::AMDGPU_PS:
@@ -1658,13 +1646,12 @@ SDValue R600TargetLowering::LowerFormalArguments(
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
-    unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF.getFunction()) +
-                      VA.getLocMemOffset();
 
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
         ISD::UNINDEXED, Ext, VT, DL, Chain,
-        DAG.getConstant(Offset, DL, MVT::i32), DAG.getUNDEF(MVT::i32), PtrInfo,
+        DAG.getConstant(PartOffset, DL, MVT::i32), DAG.getUNDEF(MVT::i32),
+        PtrInfo,
         MemVT, /* Alignment = */ 4, MachineMemOperand::MONonTemporal |
                                     MachineMemOperand::MODereferenceable |
                                     MachineMemOperand::MOInvariant);
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 5721669bf7c..177cec982f3 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1164,8 +1164,8 @@ SDValue SITargetLowering::lowerKernargMemParameter(
   // Try to avoid using an extload by loading earlier than the argument address,
   // and extracting the relevant bits. The load should hopefully be merged with
   // the previous argument.
-  if (Align < 4) {
-    assert(MemVT.getStoreSize() < 4);
+  if (MemVT.getStoreSize() < 4 && Align < 4) {
+    // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
     int64_t AlignDownOffset = alignDown(Offset, 4);
     int64_t OffsetDiff = Offset - AlignDownOffset;
 
@@ -1781,7 +1780,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
   // kern arg offset.
   const unsigned KernelArgBaseAlign = 16;
-  const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset(Fn);
 
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
@@ -1797,11 +1796,9 @@ SDValue SITargetLowering::LowerFormalArguments(
         VT = Ins[i].VT;
       EVT MemVT = VA.getLocVT();
 
-      const uint64_t Offset = ExplicitOffset + VA.getLocMemOffset();
+      const uint64_t Offset = VA.getLocMemOffset();
       unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
 
-      // The first 36 bytes of the input buffer contains information about
-      // thread group and global sizes for clover.
       SDValue Arg = lowerKernargMemParameter(
         DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
       Chains.push_back(Arg.getValue(1));
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 7c5bc7431e4..0d5ff75e37e 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -54,6 +54,16 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   Occupancy = getMaxWavesPerEU();
   limitOccupancy(MF);
 
+  CallingConv::ID CC = F.getCallingConv();
+
+  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
+    if (!F.arg_empty())
+      KernargSegmentPtr = true;
+    WorkGroupIDX = true;
+    WorkItemIDX = true;
+  } else if (CC == CallingConv::AMDGPU_PS) {
+    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
+  }
 
   if (!isEntryFunction()) {
     // Non-entry functions have no special inputs for now, other registers
@@ -73,21 +83,11 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   } else {
     if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
       KernargSegmentPtr = true;
-      assert(MaxKernArgAlign == 0);
-      MaxKernArgAlign = ST.getAlignmentForImplicitArgPtr();
+      MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
+                                 MaxKernArgAlign);
     }
   }
 
-  CallingConv::ID CC = F.getCallingConv();
-  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
-    if (!F.arg_empty())
-      KernargSegmentPtr = true;
-    WorkGroupIDX = true;
-    WorkItemIDX = true;
-  } else if (CC == CallingConv::AMDGPU_PS) {
-    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
-  }
-
   if (ST.debuggerEmitPrologue()) {
     // Enable everything.
     WorkGroupIDX = true;
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 5c2c868476b..9492b710d13 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -589,6 +589,17 @@ entry:
 ;   ret void
 ; }
 
+; FUNC-LABEL: {{^}}i65_arg:
+; HSA-VI: kernarg_segment_byte_size = 24
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+entry:
+  store i65 %in, i65 addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}i1_arg:
 ; HSA-VI: kernarg_segment_byte_size = 12
 ; HSA-VI: kernarg_segment_alignment = 4
@@ -651,7 +662,7 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
 }
 
 ; FUNC-LABEL: {{^}}empty_struct_arg:
-; HSA: kernarg_segment_byte_size = 0
+; HSA-VI: kernarg_segment_byte_size = 0
 define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
   ret void
 }
@@ -667,11 +678,11 @@ define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
 
 ; FIXME: Total argument size is computed wrong
 ; FUNC-LABEL: {{^}}struct_argument_alignment:
-; HSA: kernarg_segment_byte_size = 40
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: kernarg_segment_byte_size = 40
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
 define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
   %val0 = extractvalue {i32, i64} %arg0, 0
   %val1 = extractvalue {i32, i64} %arg0, 1
@@ -687,11 +698,11 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
 ; No padding between i8 and next struct, but round up at end to 4 byte
 ; multiple.
 ; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
-; HSA: kernarg_segment_byte_size = 28
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
-; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
-; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; HSA-VI: kernarg_segment_byte_size = 28
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
 define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
   %val0 = extractvalue <{i32, i64}> %arg0, 0
   %val1 = extractvalue <{i32, i64}> %arg0, 1
@@ -703,3 +714,47 @@ define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0,
   store volatile i64 %val3, i64 addrspace(1)* null
   ret void
 }
+
+; GCN-LABEL: {{^}}struct_argument_alignment_after:
+; HSA-VI: kernarg_segment_byte_size = 64
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg2, 0
+  %val3 = extractvalue {i32, i64} %arg2, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi32:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+  store volatile i16 %arg0, i16 addrspace(1)* undef
+  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  ret void
+}
+
+; FIXME: Why not all scalar loads?
+; GCN-LABEL: {{^}}array_3xi16:
+; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
+; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
+; HSA-VI: flat_load_ushort
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
new file mode 100644
index 00000000000..a1bb6c28e74
--- /dev/null
+++ b/test/CodeGen/AMDGPU/kernel-argument-dag-lowering.ll
@@ -0,0 +1,132 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -amdgpu-ir-lower-kernel-arguments=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
+
+; Repeat of some problematic tests in kernel-args.ll, with the IR
+; argument lowering pass disabled. Struct padding needs to be
+; accounted for, as well as legalization of types changing offsets.
+
+; FUNC-LABEL: {{^}}i1_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+
+; GCN: s_load_dword s
+; GCN: s_and_b32
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+  store i1 %x, i1 addrspace(1)* %out, align 1
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v3i8_arg:
+; HSA-VI: kernarg_segment_byte_size = 12
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+entry:
+  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}i65_arg:
+; HSA-VI: kernarg_segment_byte_size = 24
+; HSA-VI: kernarg_segment_alignment = 4
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
+entry:
+  store i65 %in, i65 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}empty_struct_arg:
+; HSA-VI: kernarg_segment_byte_size = 0
+define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
+  ret void
+}
+
+; The correct load offsets for these:
+; load 4 from 0,
+; load 8 from 8
+; load 4 from 24
+; load 8 from 32
+
+; With the SelectionDAG argument lowering, the alignments for the
+; struct members is not properly considered, making these wrong.
+
+; FIXME: Total argument size is computed wrong
+; FUNC-LABEL: {{^}}struct_argument_alignment:
+; HSA-VI: kernarg_segment_byte_size = 40
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg1, 0
+  %val3 = extractvalue {i32, i64} %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
+
+; No padding between i8 and next struct, but round up at end to 4 byte
+; multiple.
+; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
+; HSA-VI: kernarg_segment_byte_size = 28
+; HSA-VI: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
+; HSA-VI: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
+define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
+  %val0 = extractvalue <{i32, i64}> %arg0, 0
+  %val1 = extractvalue <{i32, i64}> %arg0, 1
+  %val2 = extractvalue <{i32, i64}> %arg1, 0
+  %val3 = extractvalue <{i32, i64}> %arg1, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}struct_argument_alignment_after:
+; HSA-VI: kernarg_segment_byte_size = 64
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
+  %val0 = extractvalue {i32, i64} %arg0, 0
+  %val1 = extractvalue {i32, i64} %arg0, 1
+  %val2 = extractvalue {i32, i64} %arg2, 0
+  %val3 = extractvalue {i32, i64} %arg2, 1
+  store volatile i32 %val0, i32 addrspace(1)* null
+  store volatile i64 %val1, i64 addrspace(1)* null
+  store volatile i32 %val2, i32 addrspace(1)* null
+  store volatile i64 %val3, i64 addrspace(1)* null
+  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi32:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
+define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
+  store volatile i16 %arg0, i16 addrspace(1)* undef
+  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}array_3xi16:
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
+; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
+define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
+  store volatile i8 %arg0, i8 addrspace(1)* undef
+  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
index f860a122a88..6a9191e7dcb 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.implicitarg.ptr.ll
@@ -33,7 +33,7 @@ define amdgpu_kernel void @opencl_kernel_implicitarg_ptr_empty() #1 {
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
@@ -47,7 +47,7 @@
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_load_dword s0, s[4:5], 0x1c
 define amdgpu_kernel void @opencl_kernel_implicitarg_ptr([112 x i8]) #1 {
@@ -118,10 +118,10 @@ define amdgpu_kernel void @opencl_kernel_call_implicitarg_ptr_func_empty() #1 {
 ; GCN-LABEL: {{^}}kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 112
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
 ; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x1c0
+; MESA: s_add_u32 s6, s4, 0x70
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
 
@@ -133,10 +133,9 @@ define amdgpu_kernel void @kernel_call_implicitarg_ptr_func([112 x i8]) #0 {
 ; GCN-LABEL: {{^}}opencl_kernel_call_implicitarg_ptr_func:
 ; GCN: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 160
-; MESA: kernarg_segment_byte_size = 464
+; MESA: kernarg_segment_byte_size = 128
 
-; HSA: s_add_u32 s6, s4, 0x70
-; MESA: s_add_u32 s6, s4, 0x1c0
+; GCN: s_add_u32 s6, s4, 0x70
 ; GCN: s_addc_u32 s7, s5, 0{{$}}
 ; GCN: s_swappc_b64
 
@@ -219,8 +218,7 @@ define void @opencl_func_kernarg_implicitarg_ptr() #0 {
 ; GCN-LABEL: {{^}}kernel_call_kernarg_implicitarg_ptr_func:
 ; GCN: s_mov_b64 s[6:7], s[4:5]
-; HSA: s_add_u32 s8, s6, 0x70
-; MESA: s_add_u32 s8, s6, 0x1c0
+; GCN: s_add_u32 s8, s6, 0x70
 ; GCN: s_addc_u32 s9, s7, 0
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @kernel_call_kernarg_implicitarg_ptr_func([112 x i8]) #0 {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 6c1bc9eaa76..5853d8d8e4e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -79,7 +79,7 @@ define amdgpu_kernel void @opencl_test_implicit_alignment(i32 addrspace(1)* %out
 ; CO-V2: enable_sgpr_kernarg_segment_ptr = 1
 ; HSA: kernarg_segment_byte_size = 0
 ; OS-MESA3D: kernarg_segment_byte_size = 16
-; CO-V2: kernarg_segment_alignment = 32
+; CO-V2: kernarg_segment_alignment = 4
 
 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
 define amdgpu_kernel void @test_no_kernargs() #1 {