mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 02:52:53 +02:00
Implemented cost model for masked gather and scatter operations
The cost is calculated for all X86 targets. When a gather/scatter instruction is not supported, we calculate the cost of the equivalent scalar sequence. Differential revision: http://reviews.llvm.org/D15677 llvm-svn: 256519
This commit is contained in:
parent
37e9125e68
commit
3ed0b3c7f1
@ -458,6 +458,16 @@ public:
|
|||||||
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||||
unsigned AddressSpace) const;
|
unsigned AddressSpace) const;
|
||||||
|
|
||||||
|
/// \return The cost of a Gather or Scatter operation.
|
||||||
|
/// \p Opcode - the kind of memory access: Load or Store
|
||||||
|
/// \p DataTy - the vector type of the data to be loaded or stored
|
||||||
|
/// \p Ptr - a pointer [or vector of pointers] giving the address[es] in memory
|
||||||
|
/// \p VariableMask - true when the memory access is predicated with a mask
|
||||||
|
/// that is not a compile-time constant
|
||||||
|
/// \p Alignment - the alignment of a single element
|
||||||
|
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||||
|
bool VariableMask, unsigned Alignment) const;
|
||||||
|
|
||||||
/// \return The cost of the interleaved memory operation.
|
/// \return The cost of the interleaved memory operation.
|
||||||
/// \p Opcode is the memory operation code
|
/// \p Opcode is the memory operation code
|
||||||
/// \p VecTy is the vector type of the interleaved access.
|
/// \p VecTy is the vector type of the interleaved access.
|
||||||
@ -485,10 +495,14 @@ public:
|
|||||||
/// ((v0+v2), (v1+v3), undef, undef)
|
/// ((v0+v2), (v1+v3), undef, undef)
|
||||||
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
|
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
|
||||||
|
|
||||||
/// \returns The cost of Intrinsic instructions.
|
/// \returns The cost of Intrinsic instructions. Types analysis only.
|
||||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||||
ArrayRef<Type *> Tys) const;
|
ArrayRef<Type *> Tys) const;
|
||||||
|
|
||||||
|
/// \returns The cost of Intrinsic instructions. Analyses the real arguments.
|
||||||
|
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||||
|
ArrayRef<Value *> Args) const;
|
||||||
|
|
||||||
/// \returns The cost of Call instructions.
|
/// \returns The cost of Call instructions.
|
||||||
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
|
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
|
||||||
|
|
||||||
@ -614,6 +628,9 @@ public:
|
|||||||
virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
|
virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
|
||||||
unsigned Alignment,
|
unsigned Alignment,
|
||||||
unsigned AddressSpace) = 0;
|
unsigned AddressSpace) = 0;
|
||||||
|
virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||||
|
Value *Ptr, bool VariableMask,
|
||||||
|
unsigned Alignment) = 0;
|
||||||
virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||||
unsigned Factor,
|
unsigned Factor,
|
||||||
ArrayRef<unsigned> Indices,
|
ArrayRef<unsigned> Indices,
|
||||||
@ -623,6 +640,8 @@ public:
|
|||||||
bool IsPairwiseForm) = 0;
|
bool IsPairwiseForm) = 0;
|
||||||
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||||
ArrayRef<Type *> Tys) = 0;
|
ArrayRef<Type *> Tys) = 0;
|
||||||
|
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||||
|
ArrayRef<Value *> Args) = 0;
|
||||||
virtual int getCallInstrCost(Function *F, Type *RetTy,
|
virtual int getCallInstrCost(Function *F, Type *RetTy,
|
||||||
ArrayRef<Type *> Tys) = 0;
|
ArrayRef<Type *> Tys) = 0;
|
||||||
virtual unsigned getNumberOfParts(Type *Tp) = 0;
|
virtual unsigned getNumberOfParts(Type *Tp) = 0;
|
||||||
@ -791,6 +810,12 @@ public:
|
|||||||
unsigned AddressSpace) override {
|
unsigned AddressSpace) override {
|
||||||
return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
|
return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
|
||||||
}
|
}
|
||||||
|
/// Forwards the gather/scatter cost query, with all arguments unchanged, to
/// the wrapped Impl object and returns its result.
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||||
|
Value *Ptr, bool VariableMask,
|
||||||
|
unsigned Alignment) override {
|
||||||
|
return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
|
||||||
|
Alignment);
|
||||||
|
}
|
||||||
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
|
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
|
||||||
ArrayRef<unsigned> Indices, unsigned Alignment,
|
ArrayRef<unsigned> Indices, unsigned Alignment,
|
||||||
unsigned AddressSpace) override {
|
unsigned AddressSpace) override {
|
||||||
@ -805,6 +830,10 @@ public:
|
|||||||
ArrayRef<Type *> Tys) override {
|
ArrayRef<Type *> Tys) override {
|
||||||
return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
|
return Impl.getIntrinsicInstrCost(ID, RetTy, Tys);
|
||||||
}
|
}
|
||||||
|
/// Forwards the argument-based (Value *) intrinsic cost query, unchanged, to
/// the wrapped Impl object and returns its result.
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||||
|
ArrayRef<Value *> Args) override {
|
||||||
|
return Impl.getIntrinsicInstrCost(ID, RetTy, Args);
|
||||||
|
}
|
||||||
int getCallInstrCost(Function *F, Type *RetTy,
|
int getCallInstrCost(Function *F, Type *RetTy,
|
||||||
ArrayRef<Type *> Tys) override {
|
ArrayRef<Type *> Tys) override {
|
||||||
return Impl.getCallInstrCost(F, RetTy, Tys);
|
return Impl.getCallInstrCost(F, RetTy, Tys);
|
||||||
|
@ -301,6 +301,12 @@ public:
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Default gather/scatter cost: always 1, regardless of the arguments.
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||||
|
bool VariableMask,
|
||||||
|
unsigned Alignment) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||||
unsigned Factor,
|
unsigned Factor,
|
||||||
ArrayRef<unsigned> Indices,
|
ArrayRef<unsigned> Indices,
|
||||||
@ -313,6 +319,10 @@ public:
|
|||||||
ArrayRef<Type *> Tys) {
|
ArrayRef<Type *> Tys) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
/// Default argument-based intrinsic cost: always 1, regardless of the
/// intrinsic ID, return type or arguments.
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||||
|
ArrayRef<Value *> Args) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
|
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) {
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -580,6 +580,39 @@ public:
|
|||||||
return Cost;
|
return Cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get intrinsic cost based on the actual call arguments.
///
/// masked_scatter and masked_gather are routed to the derived class's
/// getGatherScatterOpCost (as Instruction::Store and Instruction::Load
/// respectively); the mask counts as variable when it is not a Constant.
/// Every other intrinsic falls back to the type-based overload, called with
/// the types of \p Args.
|
||||||
|
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||||
|
ArrayRef<Value *> Args) {
|
||||||
|
switch (IID) {
|
||||||
|
default: {
|
||||||
|
SmallVector<Type *, 4> Types;
|
||||||
|
for (Value *Op : Args)
|
||||||
|
Types.push_back(Op->getType());
|
||||||
|
return getIntrinsicInstrCost(IID, RetTy, Types);
|
||||||
|
}
|
||||||
|
case Intrinsic::masked_scatter: {
|
||||||
|
// Operands used below: Args[0] data, Args[1] pointers,
// Args[2] alignment (cast to ConstantInt), Args[3] mask.
Value *Mask = Args[3];
|
||||||
|
bool VarMask = !isa<Constant>(Mask);
|
||||||
|
unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
|
||||||
|
return
|
||||||
|
static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Store,
|
||||||
|
Args[0]->getType(),
|
||||||
|
Args[1], VarMask,
|
||||||
|
Alignment);
|
||||||
|
}
|
||||||
|
case Intrinsic::masked_gather: {
|
||||||
|
// Operands used below: Args[0] pointers, Args[1] alignment (cast to
// ConstantInt), Args[2] mask. The data type is the call's return type.
Value *Mask = Args[2];
|
||||||
|
bool VarMask = !isa<Constant>(Mask);
|
||||||
|
unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
|
||||||
|
return
|
||||||
|
static_cast<T *>(this)->getGatherScatterOpCost(Instruction::Load,
|
||||||
|
RetTy, Args[0], VarMask,
|
||||||
|
Alignment);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Get intrinsic cost based on argument types
|
||||||
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||||
ArrayRef<Type *> Tys) {
|
ArrayRef<Type *> Tys) {
|
||||||
unsigned ISD = 0;
|
unsigned ISD = 0;
|
||||||
|
@ -500,12 +500,12 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
|
|||||||
}
|
}
|
||||||
case Instruction::Call:
|
case Instruction::Call:
|
||||||
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
||||||
SmallVector<Type*, 4> Tys;
|
SmallVector<Value *, 4> Args;
|
||||||
for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
|
for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
|
||||||
Tys.push_back(II->getArgOperand(J)->getType());
|
Args.push_back(II->getArgOperand(J));
|
||||||
|
|
||||||
return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
|
return TTI->getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
|
||||||
Tys);
|
Args);
|
||||||
}
|
}
|
||||||
return -1;
|
return -1;
|
||||||
default:
|
default:
|
||||||
|
@ -280,6 +280,15 @@ int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
|
|||||||
return Cost;
|
return Cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Delegates the gather/scatter cost query to the target implementation
// (TTIImpl) and asserts that the returned cost is not negative.
int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||||
|
Value *Ptr, bool VariableMask,
|
||||||
|
unsigned Alignment) const {
|
||||||
|
int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
|
||||||
|
Alignment);
|
||||||
|
assert(Cost >= 0 && "TTI should not produce negative costs!");
|
||||||
|
return Cost;
|
||||||
|
}
|
||||||
|
|
||||||
int TargetTransformInfo::getInterleavedMemoryOpCost(
|
int TargetTransformInfo::getInterleavedMemoryOpCost(
|
||||||
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
|
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
|
||||||
unsigned Alignment, unsigned AddressSpace) const {
|
unsigned Alignment, unsigned AddressSpace) const {
|
||||||
@ -296,6 +305,13 @@ int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
|||||||
return Cost;
|
return Cost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Argument-based overload: delegates to the target implementation (TTIImpl)
// with the real call arguments and asserts that the returned cost is not
// negative.
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||||
|
ArrayRef<Value *> Args) const {
|
||||||
|
int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args);
|
||||||
|
assert(Cost >= 0 && "TTI should not produce negative costs!");
|
||||||
|
return Cost;
|
||||||
|
}
|
||||||
|
|
||||||
int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
|
int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
|
||||||
ArrayRef<Type *> Tys) const {
|
ArrayRef<Type *> Tys) const {
|
||||||
int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys);
|
int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys);
|
||||||
|
@ -1297,6 +1297,142 @@ int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
|
|||||||
return X86TTIImpl::getIntImmCost(Imm, Ty);
|
return X86TTIImpl::getIntImmCost(Imm, Ty);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return an average cost of a Gather / Scatter instruction; may be improved
// later. When the legalization cost of the index vector type or of SrcVTy
// exceeds 1, the data type is split by that factor and the cost of one part
// is multiplied by it. Otherwise the cost is a fixed overhead plus VF times
// the cost of a scalar memory operation on the element type.
|
||||||
|
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
|
||||||
|
unsigned Alignment, unsigned AddressSpace) {
|
||||||
|

||||||
|
assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
|
||||||
|
unsigned VF = SrcVTy->getVectorNumElements();
|
||||||
|

||||||
|
// Try to reduce the index size from 64 bits (the default for GEP)
|
||||||
|
// to 32. It is essential for VF 16. If the index can't be reduced to 32, the
|
||||||
|
// operation will use 16 x 64-bit indices, which do not fit in a zmm and need
|
||||||
|
// to be split. Also check that the base pointer is the same for all lanes,
|
||||||
|
// and that there's at most one variable index.
|
||||||
|
auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
|
||||||
|
unsigned IndexSize = DL.getPointerSizeInBits();
|
||||||
|
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
|
||||||
|
// Nothing to narrow: pointers are already below 64 bits, or Ptr is not a
// GEP instruction.
if (IndexSize < 64 || !GEP)
|
||||||
|
return IndexSize;
|
||||||
|

||||||
|
unsigned NumOfVarIndices = 0;
|
||||||
|
Value *Ptrs = GEP->getPointerOperand();
|
||||||
|
// A vector base with no splat value keeps the full index size.
if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
|
||||||
|
return IndexSize;
|
||||||
|
for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
|
||||||
|
// Constant indices are ignored.
if (isa<Constant>(GEP->getOperand(i)))
|
||||||
|
continue;
|
||||||
|
Type *IndxTy = GEP->getOperand(i)->getType();
|
||||||
|
if (IndxTy->isVectorTy())
|
||||||
|
IndxTy = IndxTy->getVectorElementType();
|
||||||
|
// Give up on a 64-bit variable index that is not a sign-extension, or on
// a second variable index.
if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
|
||||||
|
!isa<SExtInst>(GEP->getOperand(i))) ||
|
||||||
|
++NumOfVarIndices > 1)
|
||||||
|
return IndexSize; // 64
|
||||||
|
}
|
||||||
|
return (unsigned)32;
|
||||||
|
};
|
||||||
|

||||||
|

||||||
|
// Try to reduce IndexSize to 32 bits only when VF >= 16.
|
||||||
|
// Otherwise IndexSize is the pointer size in bits.
|
||||||
|
unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
|
||||||
|
DL.getPointerSizeInBits();
|
||||||
|

||||||
|
Type *IndexVTy = VectorType::get(IntegerType::get(getGlobalContext(),
|
||||||
|
IndexSize), VF);
|
||||||
|
std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
|
||||||
|
std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
|
||||||
|
int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
|
||||||
|
if (SplitFactor > 1) {
|
||||||
|
// Handle splitting of the vector of pointers: cost one part of
// VF / SplitFactor elements and multiply by the number of parts.
|
||||||
|
Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
|
||||||
|
return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
|
||||||
|
AddressSpace);
|
||||||
|
}
|
||||||
|

||||||
|
// The gather / scatter cost is given by Intel architects. It is a rough
|
||||||
|
// number since we are looking at one instruction at a time.
|
||||||
|
const int GSOverhead = 2;
|
||||||
|
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
|
||||||
|
Alignment, AddressSpace);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return the cost of full scalarization of a gather / scatter operation.
|
||||||
|
///
|
||||||
|
/// Opcode - Load or Store instruction.
|
||||||
|
/// SrcVTy - The type of the data vector that should be gathered or scattered.
|
||||||
|
/// VariableMask - The mask is non-constant at compile time.
|
||||||
|
/// Alignment - Alignment for one element.
|
||||||
|
/// AddressSpace - pointer[s] address space.
|
||||||
|
///
/// The result is the sum of three parts: VF scalar memory operations; when
/// VariableMask is set, the overhead of unpacking the <VF x i1> mask plus one
/// compare and one branch per element; and one insertelement (for Load) or
/// one extractelement (for any other Opcode) per element.
|
||||||
|
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
|
||||||
|
bool VariableMask, unsigned Alignment,
|
||||||
|
unsigned AddressSpace) {
|
||||||
|
unsigned VF = SrcVTy->getVectorNumElements();
|
||||||
|

||||||
|
int MaskUnpackCost = 0;
|
||||||
|
if (VariableMask) {
|
||||||
|
VectorType *MaskTy =
|
||||||
|
VectorType::get(Type::getInt1Ty(getGlobalContext()), VF);
|
||||||
|
// NOTE(review): the (false, true) flags presumably select extract-only
// overhead - confirm against getScalarizationOverhead's signature.
MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
|
||||||
|
int ScalarCompareCost =
|
||||||
|
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(getGlobalContext()),
|
||||||
|
nullptr);
|
||||||
|
int BranchCost = getCFInstrCost(Instruction::Br);
|
||||||
|
// One compare and one branch per mask element.
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
|
||||||
|
}
|
||||||
|

||||||
|
// The cost of the scalar loads/stores.
|
||||||
|
int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
|
||||||
|
Alignment, AddressSpace);
|
||||||
|

||||||
|
int InsertExtractCost = 0;
|
||||||
|
if (Opcode == Instruction::Load)
|
||||||
|
for (unsigned i = 0; i < VF; ++i)
|
||||||
|
// Add the cost of inserting each scalar load into the vector
|
||||||
|
InsertExtractCost +=
|
||||||
|
getVectorInstrCost(Instruction::InsertElement, SrcVTy, i);
|
||||||
|
else
|
||||||
|
for (unsigned i = 0; i < VF; ++i)
|
||||||
|
// Add the cost of extracting each element out of the data vector
|
||||||
|
InsertExtractCost +=
|
||||||
|
getVectorInstrCost(Instruction::ExtractElement, SrcVTy, i);
|
||||||
|

||||||
|
return MemoryOpCost + MaskUnpackCost + InsertExtractCost;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Calculate the cost of a Gather (Opcode == Load) / Scatter (Opcode == Store)
/// operation on the vector type SrcVTy.
///
/// The address space is taken from Ptr's pointer type, or from the element
/// type when Ptr is a vector of pointers. The fully scalarized cost is
/// returned when the masked gather/scatter is not legal for SrcVTy, when
/// VF == 2, or when VF == 4 and the subtarget lacks VLX; otherwise the vector
/// instruction cost is returned.
|
||||||
|
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
|
||||||
|
Value *Ptr, bool VariableMask,
|
||||||
|
unsigned Alignment) {
|
||||||
|
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
|
||||||
|
unsigned VF = SrcVTy->getVectorNumElements();
|
||||||
|
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
|
||||||
|
// Ptr may be a vector of pointers; look at its element type in that case.
if (!PtrTy && Ptr->getType()->isVectorTy())
|
||||||
|
PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
|
||||||
|
assert(PtrTy && "Unexpected type for Ptr argument");
|
||||||
|
unsigned AddressSpace = PtrTy->getAddressSpace();
|
||||||
|

||||||
|
bool Scalarize = false;
|
||||||
|
if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
|
||||||
|
(Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
|
||||||
|
Scalarize = true;
|
||||||
|
// Gather / Scatter for vector 2 is not profitable on KNL / SKX.
|
||||||
|
// A 4-element gather/scatter instruction does not exist on KNL.
|
||||||
|
// We can extend it to 8 elements, but zeroing upper bits of
|
||||||
|
// the mask vector will add more instructions. Right now we give the scalar
|
||||||
|
// cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is
|
||||||
|
// better in the VariableMask case.
|
||||||
|
if (VF == 2 || (VF == 4 && !ST->hasVLX()))
|
||||||
|
Scalarize = true;
|
||||||
|

||||||
|
if (Scalarize)
|
||||||
|
return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace);
|
||||||
|

||||||
|
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
|
||||||
|
}
|
||||||
|
|
||||||
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
|
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
|
||||||
Type *ScalarTy = DataTy->getScalarType();
|
Type *ScalarTy = DataTy->getScalarType();
|
||||||
int DataWidth = isa<PointerType>(ScalarTy) ?
|
int DataWidth = isa<PointerType>(ScalarTy) ?
|
||||||
|
@ -76,7 +76,8 @@ public:
|
|||||||
unsigned AddressSpace);
|
unsigned AddressSpace);
|
||||||
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||||
unsigned AddressSpace);
|
unsigned AddressSpace);
|
||||||
|
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||||
|
bool VariableMask, unsigned Alignment);
|
||||||
int getAddressComputationCost(Type *PtrTy, bool IsComplex);
|
int getAddressComputationCost(Type *PtrTy, bool IsComplex);
|
||||||
|
|
||||||
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
|
int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
|
||||||
@ -94,6 +95,11 @@ public:
|
|||||||
bool isLegalMaskedScatter(Type *DataType);
|
bool isLegalMaskedScatter(Type *DataType);
|
||||||
bool areInlineCompatible(const Function *Caller,
|
bool areInlineCompatible(const Function *Caller,
|
||||||
const Function *Callee) const;
|
const Function *Callee) const;
|
||||||
|
private:
|
||||||
|
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
|
||||||
|
unsigned Alignment, unsigned AddressSpace);
|
||||||
|
int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||||
|
unsigned Alignment, unsigned AddressSpace);
|
||||||
|
|
||||||
/// @}
|
/// @}
|
||||||
};
|
};
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s -check-prefix=AVX2
|
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -cost-model -analyze < %s | FileCheck %s --check-prefix=AVX2
|
||||||
|
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=knl -cost-model -analyze < %s | FileCheck %s --check-prefix=KNL
|
||||||
|
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=skx -cost-model -analyze < %s | FileCheck %s --check-prefix=SKX
|
||||||
|
|
||||||
|
|
||||||
; AVX2-LABEL: test1
|
; AVX2-LABEL: test1
|
||||||
@ -65,6 +67,217 @@ define <2 x i32> @test8(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %dst) {
|
|||||||
ret <2 x i32> %res
|
ret <2 x i32> %res
|
||||||
}
|
}
|
||||||
|
|
||||||
|
define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x double> %src0) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_2f64
|
||||||
|
; AVX2: Found an estimated cost of 7 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_2f64
|
||||||
|
; KNL: Found an estimated cost of 7 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_2f64
|
||||||
|
; SKX: Found an estimated cost of 7 {{.*}}.gather
|
||||||
|
|
||||||
|
%res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
|
||||||
|
ret <2 x double> %res
|
||||||
|
}
|
||||||
|
declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
|
||||||
|
|
||||||
|
define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_4i32
|
||||||
|
; AVX2: Found an estimated cost of 16 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_4i32
|
||||||
|
; KNL: Found an estimated cost of 16 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_4i32
|
||||||
|
; SKX: Found an estimated cost of 6 {{.*}}.gather
|
||||||
|
|
||||||
|
%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
|
||||||
|
ret <4 x i32> %res
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_4i32_const_mask
|
||||||
|
; AVX2: Found an estimated cost of 8 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_4i32_const_mask
|
||||||
|
; KNL: Found an estimated cost of 8 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_4i32_const_mask
|
||||||
|
; SKX: Found an estimated cost of 6 {{.*}}.gather
|
||||||
|
|
||||||
|
%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
|
||||||
|
ret <4 x i32> %res
|
||||||
|
}
|
||||||
|
declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
|
||||||
|
|
||||||
|
define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_16f32_const_mask
|
||||||
|
; AVX2: Found an estimated cost of 30 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_16f32_const_mask
|
||||||
|
; KNL: Found an estimated cost of 18 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_16f32_const_mask
|
||||||
|
; SKX: Found an estimated cost of 18 {{.*}}.gather
|
||||||
|
|
||||||
|
%sext_ind = sext <16 x i32> %ind to <16 x i64>
|
||||||
|
%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
|
||||||
|
|
||||||
|
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
|
||||||
|
ret <16 x float>%res
|
||||||
|
}
|
||||||
|
|
||||||
|
define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <16 x i1>%mask) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_16f32_var_mask
|
||||||
|
; AVX2: Found an estimated cost of 62 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_16f32_var_mask
|
||||||
|
; KNL: Found an estimated cost of 18 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_16f32_var_mask
|
||||||
|
; SKX: Found an estimated cost of 18 {{.*}}.gather
|
||||||
|
|
||||||
|
%sext_ind = sext <16 x i32> %ind to <16 x i64>
|
||||||
|
%gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind
|
||||||
|
|
||||||
|
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
|
||||||
|
ret <16 x float>%res
|
||||||
|
}
|
||||||
|
|
||||||
|
define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i32> %ind, <16 x i1>%mask) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_16f32_ra_var_mask
|
||||||
|
; AVX2: Found an estimated cost of 62 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_16f32_ra_var_mask
|
||||||
|
; KNL: Found an estimated cost of 20 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_16f32_ra_var_mask
|
||||||
|
; SKX: Found an estimated cost of 20 {{.*}}.gather
|
||||||
|
|
||||||
|
%sext_ind = sext <16 x i32> %ind to <16 x i64>
|
||||||
|
%gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind
|
||||||
|
|
||||||
|
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
|
||||||
|
ret <16 x float>%res
|
||||||
|
}
|
||||||
|
|
||||||
|
define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_16f32_const_mask2
|
||||||
|
; AVX2: Found an estimated cost of 30 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_16f32_const_mask2
|
||||||
|
; KNL: Found an estimated cost of 18 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_16f32_const_mask2
|
||||||
|
; SKX: Found an estimated cost of 18 {{.*}}.gather
|
||||||
|
|
||||||
|
%broadcast.splatinsert = insertelement <16 x float*> undef, float* %base, i32 0
|
||||||
|
%broadcast.splat = shufflevector <16 x float*> %broadcast.splatinsert, <16 x float*> undef, <16 x i32> zeroinitializer
|
||||||
|
|
||||||
|
%sext_ind = sext <16 x i32> %ind to <16 x i64>
|
||||||
|
%gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
|
||||||
|
|
||||||
|
%res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
|
||||||
|
ret <16 x float>%res
|
||||||
|
}
|
||||||
|
|
||||||
|
define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
|
||||||
|
; AVX2-LABEL: test_scatter_16i32
|
||||||
|
; AVX2: Found an estimated cost of 64 {{.*}}.scatter
|
||||||
|
|
||||||
|
; KNL-LABEL: test_scatter_16i32
|
||||||
|
; KNL: Found an estimated cost of 18 {{.*}}.scatter
|
||||||
|
|
||||||
|
; SKX-LABEL: test_scatter_16i32
|
||||||
|
; SKX: Found an estimated cost of 18 {{.*}}.scatter
|
||||||
|
|
||||||
|
%broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
|
||||||
|
%broadcast.splat = shufflevector <16 x i32*> %broadcast.splatinsert, <16 x i32*> undef, <16 x i32> zeroinitializer
|
||||||
|
|
||||||
|
%gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
|
||||||
|
%imask = bitcast i16 %mask to <16 x i1>
|
||||||
|
call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
|
||||||
|
; AVX2-LABEL: test_scatter_8i32
|
||||||
|
; AVX2: Found an estimated cost of 32 {{.*}}.scatter
|
||||||
|
|
||||||
|
; KNL-LABEL: test_scatter_8i32
|
||||||
|
; KNL: Found an estimated cost of 10 {{.*}}.scatter
|
||||||
|
|
||||||
|
; SKX-LABEL: test_scatter_8i32
|
||||||
|
; SKX: Found an estimated cost of 10 {{.*}}.scatter
|
||||||
|
|
||||||
|
call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
|
||||||
|
|
||||||
|
define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
|
||||||
|
; AVX2-LABEL: test_scatter_4i32
|
||||||
|
; AVX2: Found an estimated cost of 16 {{.*}}.scatter
|
||||||
|
|
||||||
|
; KNL-LABEL: test_scatter_4i32
|
||||||
|
; KNL: Found an estimated cost of 16 {{.*}}.scatter
|
||||||
|
|
||||||
|
; SKX-LABEL: test_scatter_4i32
|
||||||
|
; SKX: Found an estimated cost of 6 {{.*}}.scatter
|
||||||
|
|
||||||
|
call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_4f32
|
||||||
|
; AVX2: Found an estimated cost of 15 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_4f32
|
||||||
|
; KNL: Found an estimated cost of 15 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_4f32
|
||||||
|
; SKX: Found an estimated cost of 6 {{.*}}.gather
|
||||||
|
|
||||||
|
%sext_ind = sext <4 x i32> %ind to <4 x i64>
|
||||||
|
%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
|
||||||
|
|
||||||
|
%res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
|
||||||
|
ret <4 x float>%res
|
||||||
|
}
|
||||||
|
|
||||||
|
define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
|
||||||
|
|
||||||
|
; AVX2-LABEL: test_gather_4f32_const_mask
|
||||||
|
; AVX2: Found an estimated cost of 7 {{.*}}.gather
|
||||||
|
|
||||||
|
; KNL-LABEL: test_gather_4f32_const_mask
|
||||||
|
; KNL: Found an estimated cost of 7 {{.*}}.gather
|
||||||
|
|
||||||
|
; SKX-LABEL: test_gather_4f32_const_mask
|
||||||
|
; SKX: Found an estimated cost of 6 {{.*}}.gather
|
||||||
|
|
||||||
|
%sext_ind = sext <4 x i32> %ind to <4 x i64>
|
||||||
|
%gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind
|
||||||
|
|
||||||
|
%res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
|
||||||
|
ret <4 x float>%res
|
||||||
|
}
|
||||||
|
|
||||||
|
declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
|
||||||
|
declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
|
||||||
|
declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
|
||||||
|
declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
|
||||||
|
|
||||||
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
|
declare <16 x i32> @llvm.masked.load.v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
|
||||||
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
|
declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
|
||||||
|
Loading…
Reference in New Issue
Block a user