mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[TTI][ARM][MVE] Refine gather/scatter cost model
Refines the gather/scatter cost model, but also changes the TTI function getIntrinsicInstrCost to accept an additional parameter which is needed for the gather/scatter cost evaluation. This did require trivial changes in some non-ARM backends to adopt the new parameter. Extending gathers and truncating scatters are now priced cheaper. Differential Revision: https://reviews.llvm.org/D75525
This commit is contained in:
parent
57135c3cb8
commit
d262d5349f
@ -966,8 +966,11 @@ public:
|
||||
/// \p VariableMask - true when the memory access is predicated with a mask
|
||||
/// that is not a compile-time constant
|
||||
/// \p Alignment - alignment of single element
|
||||
/// \p I - the optional original context instruction, if one exists, e.g. the
|
||||
/// load/store to transform or the call to the gather/scatter intrinsic
|
||||
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||
bool VariableMask, unsigned Alignment) const;
|
||||
bool VariableMask, unsigned Alignment,
|
||||
const Instruction *I = nullptr) const;
|
||||
|
||||
/// \return The cost of the interleaved memory operation.
|
||||
/// \p Opcode is the memory operation code
|
||||
@ -1006,16 +1009,22 @@ public:
|
||||
/// \returns The cost of Intrinsic instructions. Analyses the real arguments.
|
||||
/// Three cases are handled: 1. scalar instruction 2. vector instruction
|
||||
/// 3. scalar instruction which is to be vectorized with VF.
|
||||
/// I is the optional original context instruction holding the call to the
|
||||
/// intrinsic
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF = 1) const;
|
||||
unsigned VF = 1,
|
||||
const Instruction *I = nullptr) const;
|
||||
|
||||
/// \returns The cost of Intrinsic instructions. Types analysis only.
|
||||
/// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
|
||||
/// arguments and the return value will be computed based on types.
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX) const;
|
||||
/// I is the optional original context instruction holding the call to the
|
||||
/// intrinsic
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
|
||||
FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX,
|
||||
const Instruction *I = nullptr) const;
|
||||
|
||||
/// \returns The cost of Call instructions.
|
||||
int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
|
||||
@ -1340,9 +1349,9 @@ public:
|
||||
virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
|
||||
unsigned Alignment,
|
||||
unsigned AddressSpace) = 0;
|
||||
virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||
Value *Ptr, bool VariableMask,
|
||||
unsigned Alignment) = 0;
|
||||
virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||
bool VariableMask, unsigned Alignment,
|
||||
const Instruction *I = nullptr) = 0;
|
||||
virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||
unsigned Factor,
|
||||
ArrayRef<unsigned> Indices,
|
||||
@ -1355,10 +1364,12 @@ public:
|
||||
virtual int getMinMaxReductionCost(Type *Ty, Type *CondTy,
|
||||
bool IsPairwiseForm, bool IsUnsigned) = 0;
|
||||
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) = 0;
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) = 0;
|
||||
virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF, const Instruction *I) = 0;
|
||||
virtual int getCallInstrCost(Function *F, Type *RetTy,
|
||||
ArrayRef<Type *> Tys) = 0;
|
||||
virtual unsigned getNumberOfParts(Type *Tp) = 0;
|
||||
@ -1759,11 +1770,11 @@ public:
|
||||
unsigned AddressSpace) override {
|
||||
return Impl.getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
|
||||
}
|
||||
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||
Value *Ptr, bool VariableMask,
|
||||
unsigned Alignment) override {
|
||||
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||
bool VariableMask, unsigned Alignment,
|
||||
const Instruction *I = nullptr) override {
|
||||
return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
|
||||
Alignment);
|
||||
Alignment, I);
|
||||
}
|
||||
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
|
||||
ArrayRef<unsigned> Indices, unsigned Alignment,
|
||||
@ -1781,15 +1792,18 @@ public:
|
||||
bool IsPairwiseForm, bool IsUnsigned) override {
|
||||
return Impl.getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
|
||||
}
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
|
||||
FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
|
||||
return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed);
|
||||
}
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
|
||||
return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
|
||||
}
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) override {
|
||||
return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed, I);
|
||||
}
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF, const Instruction *I) override {
|
||||
return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
|
||||
}
|
||||
int getCallInstrCost(Function *F, Type *RetTy,
|
||||
ArrayRef<Type *> Tys) override {
|
||||
return Impl.getCallInstrCost(F, RetTy, Tys);
|
||||
|
@ -481,8 +481,8 @@ public:
|
||||
}
|
||||
|
||||
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||
bool VariableMask,
|
||||
unsigned Alignment) {
|
||||
bool VariableMask, unsigned Alignment,
|
||||
const Instruction *I = nullptr) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -497,11 +497,13 @@ public:
|
||||
|
||||
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) {
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) {
|
||||
return 1;
|
||||
}
|
||||
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF, const Instruction *I) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -1072,7 +1072,8 @@ public:
|
||||
/// Get intrinsic cost based on arguments.
|
||||
unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF = 1) {
|
||||
unsigned VF = 1,
|
||||
const Instruction *I = nullptr) {
|
||||
unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
|
||||
assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
|
||||
auto *ConcreteTTI = static_cast<T *>(this);
|
||||
@ -1109,16 +1110,17 @@ public:
|
||||
Value *Mask = Args[3];
|
||||
bool VarMask = !isa<Constant>(Mask);
|
||||
unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
|
||||
return ConcreteTTI->getGatherScatterOpCost(
|
||||
Instruction::Store, Args[0]->getType(), Args[1], VarMask, Alignment);
|
||||
return ConcreteTTI->getGatherScatterOpCost(Instruction::Store,
|
||||
Args[0]->getType(), Args[1],
|
||||
VarMask, Alignment, I);
|
||||
}
|
||||
case Intrinsic::masked_gather: {
|
||||
assert(VF == 1 && "Can't vectorize types here.");
|
||||
Value *Mask = Args[2];
|
||||
bool VarMask = !isa<Constant>(Mask);
|
||||
unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
|
||||
return ConcreteTTI->getGatherScatterOpCost(Instruction::Load, RetTy,
|
||||
Args[0], VarMask, Alignment);
|
||||
return ConcreteTTI->getGatherScatterOpCost(
|
||||
Instruction::Load, RetTy, Args[0], VarMask, Alignment, I);
|
||||
}
|
||||
case Intrinsic::experimental_vector_reduce_add:
|
||||
case Intrinsic::experimental_vector_reduce_mul:
|
||||
@ -1180,7 +1182,8 @@ public:
|
||||
/// based on types.
|
||||
unsigned getIntrinsicInstrCost(
|
||||
Intrinsic::ID IID, Type *RetTy, ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max()) {
|
||||
unsigned ScalarizationCostPassed = std::numeric_limits<unsigned>::max(),
|
||||
const Instruction *I = nullptr) {
|
||||
auto *ConcreteTTI = static_cast<T *>(this);
|
||||
|
||||
SmallVector<unsigned, 2> ISDs;
|
||||
|
@ -674,9 +674,10 @@ int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
|
||||
|
||||
int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||
Value *Ptr, bool VariableMask,
|
||||
unsigned Alignment) const {
|
||||
unsigned Alignment,
|
||||
const Instruction *I) const {
|
||||
int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
|
||||
Alignment);
|
||||
Alignment, I);
|
||||
assert(Cost >= 0 && "TTI should not produce negative costs!");
|
||||
return Cost;
|
||||
}
|
||||
@ -694,17 +695,21 @@ int TargetTransformInfo::getInterleavedMemoryOpCost(
|
||||
}
|
||||
|
||||
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) const {
|
||||
ArrayRef<Type *> Tys,
|
||||
FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) const {
|
||||
int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed);
|
||||
ScalarizationCostPassed, I);
|
||||
assert(Cost >= 0 && "TTI should not produce negative costs!");
|
||||
return Cost;
|
||||
}
|
||||
|
||||
int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
|
||||
int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
|
||||
ArrayRef<Value *> Args,
|
||||
FastMathFlags FMF, unsigned VF,
|
||||
const Instruction *I) const {
|
||||
int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
|
||||
assert(Cost >= 0 && "TTI should not produce negative costs!");
|
||||
return Cost;
|
||||
}
|
||||
@ -1339,8 +1344,8 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
|
||||
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
|
||||
FMF = FPMO->getFastMathFlags();
|
||||
|
||||
return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
|
||||
Args, FMF);
|
||||
return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
|
||||
FMF, 1, II);
|
||||
}
|
||||
return -1;
|
||||
default:
|
||||
|
@ -478,14 +478,14 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
|
||||
|
||||
template <typename T>
|
||||
int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<T *> Args,
|
||||
FastMathFlags FMF, unsigned VF) {
|
||||
ArrayRef<T *> Args, FastMathFlags FMF,
|
||||
unsigned VF, const Instruction *I) {
|
||||
if (ID != Intrinsic::fma)
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
|
||||
|
||||
EVT OrigTy = TLI->getValueType(DL, RetTy);
|
||||
if (!OrigTy.isSimple()) {
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
|
||||
}
|
||||
|
||||
// Legalize the type.
|
||||
@ -507,16 +507,17 @@ int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
}
|
||||
|
||||
int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value*> Args, FastMathFlags FMF,
|
||||
unsigned VF) {
|
||||
return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF);
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF, const Instruction *I) {
|
||||
return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF, I);
|
||||
}
|
||||
|
||||
int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) {
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) {
|
||||
return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed);
|
||||
ScalarizationCostPassed, I);
|
||||
}
|
||||
|
||||
unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
|
||||
@ -889,7 +890,7 @@ unsigned GCNTTIImpl::getUserCost(const User *U,
|
||||
if (auto *FPMO = dyn_cast<FPMathOperator>(II))
|
||||
FMF = FPMO->getFastMathFlags();
|
||||
return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
|
||||
FMF);
|
||||
FMF, 1, II);
|
||||
} else {
|
||||
return BaseT::getUserCost(U, Operands);
|
||||
}
|
||||
|
@ -219,15 +219,16 @@ public:
|
||||
Type *Ty,
|
||||
bool IsPairwise);
|
||||
template <typename T>
|
||||
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<T *> Args, FastMathFlags FMF,
|
||||
unsigned VF);
|
||||
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy, ArrayRef<T *> Args,
|
||||
FastMathFlags FMF, unsigned VF,
|
||||
const Instruction *I = nullptr);
|
||||
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX);
|
||||
unsigned ScalarizationCostPassed = UINT_MAX,
|
||||
const Instruction *I = nullptr);
|
||||
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF = 1);
|
||||
unsigned VF = 1, const Instruction *I = nullptr);
|
||||
int getMinMaxReductionCost(Type *Ty, Type *CondTy,
|
||||
bool IsPairwiseForm,
|
||||
bool IsUnsigned);
|
||||
|
@ -863,16 +863,17 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
|
||||
|
||||
unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||
Value *Ptr, bool VariableMask,
|
||||
unsigned Alignment) {
|
||||
unsigned Alignment,
|
||||
const Instruction *I) {
|
||||
using namespace PatternMatch;
|
||||
if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
|
||||
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
|
||||
Alignment);
|
||||
Alignment, I);
|
||||
|
||||
assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
|
||||
VectorType *VTy = cast<VectorType>(DataTy);
|
||||
|
||||
// TODO: Splitting, once we do that.
|
||||
// TODO: trunc/sext/zext the result/input
|
||||
|
||||
unsigned NumElems = VTy->getNumElements();
|
||||
unsigned EltSize = VTy->getScalarSizeInBits();
|
||||
@ -889,19 +890,54 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||
unsigned ScalarCost =
|
||||
NumElems * LT.first + BaseT::getScalarizationOverhead(DataTy, {});
|
||||
|
||||
// TODO: Cost extended gathers or trunc stores correctly.
|
||||
if (EltSize * NumElems != 128 || NumElems < 4)
|
||||
return ScalarCost;
|
||||
if (Alignment < EltSize / 8)
|
||||
return ScalarCost;
|
||||
|
||||
unsigned ExtSize = EltSize;
|
||||
// Check whether there's a single user that asks for an extended type
|
||||
if (I != nullptr) {
|
||||
// Depending on the caller of this function, a gather instruction will
|
||||
// either have opcode Instruction::Load or be a call to the masked_gather
|
||||
// intrinsic
|
||||
if ((I->getOpcode() == Instruction::Load ||
|
||||
match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
|
||||
I->hasOneUse()) {
|
||||
const User *Us = *I->users().begin();
|
||||
if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
|
||||
// Only allow valid type combinations
|
||||
unsigned TypeSize =
|
||||
cast<Instruction>(Us)->getType()->getScalarSizeInBits();
|
||||
if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
|
||||
(TypeSize == 16 && EltSize == 8)) &&
|
||||
TypeSize * NumElems == 128) {
|
||||
ExtSize = TypeSize;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Check whether the input data needs to be truncated
|
||||
TruncInst *T;
|
||||
if ((I->getOpcode() == Instruction::Store ||
|
||||
match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
|
||||
(T = dyn_cast<TruncInst>(I->getOperand(0)))) {
|
||||
// Only allow valid type combinations
|
||||
unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
|
||||
if (((EltSize == 16 && TypeSize == 32) ||
|
||||
(EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
|
||||
TypeSize * NumElems == 128)
|
||||
ExtSize = TypeSize;
|
||||
}
|
||||
}
|
||||
|
||||
if (ExtSize * NumElems != 128 || NumElems < 4)
|
||||
return ScalarCost;
|
||||
|
||||
// Any (aligned) i32 gather will not need to be scalarised.
|
||||
if (EltSize == 32)
|
||||
if (ExtSize == 32)
|
||||
return VectorCost;
|
||||
// For smaller types, we need to ensure that the gep's inputs are correctly
|
||||
// extended from a small enough value. Other size (including i64) are
|
||||
// extended from a small enough value. Other sizes (including i64) are
|
||||
// scalarized for now.
|
||||
if (EltSize != 8 && EltSize != 16)
|
||||
if (ExtSize != 8 && ExtSize != 16)
|
||||
return ScalarCost;
|
||||
|
||||
if (auto BC = dyn_cast<BitCastInst>(Ptr))
|
||||
@ -911,12 +947,13 @@ unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||
return ScalarCost;
|
||||
unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
|
||||
// Scale needs to be correct (which is only relevant for i16s).
|
||||
if (Scale != 1 && Scale * 8 != EltSize)
|
||||
if (Scale != 1 && Scale * 8 != ExtSize)
|
||||
return ScalarCost;
|
||||
// And we need to zext (not sext) the indexes from a small enough type.
|
||||
if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1)))
|
||||
if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= EltSize)
|
||||
if (auto ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
|
||||
if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
|
||||
return VectorCost;
|
||||
}
|
||||
return ScalarCost;
|
||||
}
|
||||
return ScalarCost;
|
||||
|
@ -222,7 +222,8 @@ public:
|
||||
bool UseMaskForGaps = false);
|
||||
|
||||
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||
bool VariableMask, unsigned Alignment);
|
||||
bool VariableMask, unsigned Alignment,
|
||||
const Instruction *I = nullptr);
|
||||
|
||||
bool isLoweredToCall(const Function *F);
|
||||
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
|
||||
|
@ -131,19 +131,23 @@ unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
|
||||
}
|
||||
|
||||
unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
|
||||
ArrayRef<Value *> Args,
|
||||
FastMathFlags FMF, unsigned VF,
|
||||
const Instruction *I) {
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
|
||||
}
|
||||
|
||||
unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type*> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) {
|
||||
ArrayRef<Type *> Tys,
|
||||
FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) {
|
||||
if (ID == Intrinsic::bswap) {
|
||||
std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy);
|
||||
return LT.first + 2;
|
||||
}
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed);
|
||||
ScalarizationCostPassed, I);
|
||||
}
|
||||
|
||||
unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
|
||||
@ -209,9 +213,11 @@ unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
|
||||
}
|
||||
|
||||
unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
|
||||
Value *Ptr, bool VariableMask, unsigned Alignment) {
|
||||
Value *Ptr, bool VariableMask,
|
||||
unsigned Alignment,
|
||||
const Instruction *I) {
|
||||
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
|
||||
Alignment);
|
||||
Alignment, I);
|
||||
}
|
||||
|
||||
unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
|
||||
|
@ -106,10 +106,12 @@ public:
|
||||
unsigned VF);
|
||||
unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
|
||||
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF, const Instruction *I);
|
||||
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type*> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX);
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX,
|
||||
const Instruction *I = nullptr);
|
||||
unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
|
||||
const SCEV *S);
|
||||
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
|
||||
@ -120,7 +122,8 @@ public:
|
||||
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
|
||||
Type *SubTp);
|
||||
unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||
bool VariableMask, unsigned Alignment);
|
||||
bool VariableMask, unsigned Alignment,
|
||||
const Instruction *I);
|
||||
unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||
unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
|
||||
unsigned AddressSpace, bool UseMaskForCond = false,
|
||||
|
@ -936,17 +936,21 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
|
||||
}
|
||||
|
||||
unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
|
||||
ArrayRef<Value *> Args,
|
||||
FastMathFlags FMF, unsigned VF,
|
||||
const Instruction *I) {
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
|
||||
}
|
||||
|
||||
unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type*> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) {
|
||||
ArrayRef<Type *> Tys,
|
||||
FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) {
|
||||
if (ID == Intrinsic::bswap && ST->hasP9Vector())
|
||||
return TLI->getTypeLegalizationCost(DL, RetTy).first;
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed);
|
||||
ScalarizationCostPassed, I);
|
||||
}
|
||||
|
||||
bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
|
||||
|
@ -111,10 +111,12 @@ public:
|
||||
bool UseMaskForCond = false,
|
||||
bool UseMaskForGaps = false);
|
||||
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF, const Instruction *I = nullptr);
|
||||
unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type*> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX);
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX,
|
||||
const Instruction *I = nullptr);
|
||||
|
||||
/// @}
|
||||
};
|
||||
|
@ -1124,20 +1124,22 @@ static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
|
||||
|
||||
int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args,
|
||||
FastMathFlags FMF, unsigned VF) {
|
||||
FastMathFlags FMF, unsigned VF,
|
||||
const Instruction *I) {
|
||||
int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
|
||||
if (Cost != -1)
|
||||
return Cost;
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I);
|
||||
}
|
||||
|
||||
int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys,
|
||||
FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) {
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) {
|
||||
int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
|
||||
if (Cost != -1)
|
||||
return Cost;
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
|
||||
FMF, ScalarizationCostPassed);
|
||||
return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed, I);
|
||||
}
|
||||
|
@ -101,10 +101,11 @@ public:
|
||||
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF = 1);
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX);
|
||||
unsigned VF = 1, const Instruction *I = nullptr);
|
||||
int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
|
||||
FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX,
|
||||
const Instruction *I = nullptr);
|
||||
/// @}
|
||||
};
|
||||
|
||||
|
@ -1887,7 +1887,8 @@ unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
|
||||
|
||||
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed) {
|
||||
unsigned ScalarizationCostPassed,
|
||||
const Instruction *I) {
|
||||
// Costs should match the codegen from:
|
||||
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
|
||||
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
|
||||
@ -2309,12 +2310,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
return LT.first * Entry->Cost;
|
||||
}
|
||||
|
||||
return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
|
||||
return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF,
|
||||
ScalarizationCostPassed, I);
|
||||
}
|
||||
|
||||
int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF) {
|
||||
unsigned VF, const Instruction *I) {
|
||||
static const CostTblEntry AVX512CostTbl[] = {
|
||||
{ ISD::ROTL, MVT::v8i64, 1 },
|
||||
{ ISD::ROTL, MVT::v4i64, 1 },
|
||||
@ -2404,7 +2406,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
return LT.first * Entry->Cost;
|
||||
}
|
||||
|
||||
return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
|
||||
return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF, I);
|
||||
}
|
||||
|
||||
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
|
||||
@ -3354,7 +3356,8 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
|
||||
/// Calculate the cost of Gather / Scatter operation
|
||||
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
|
||||
Value *Ptr, bool VariableMask,
|
||||
unsigned Alignment) {
|
||||
unsigned Alignment,
|
||||
const Instruction *I = nullptr) {
|
||||
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
|
||||
unsigned VF = SrcVTy->getVectorNumElements();
|
||||
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
|
||||
|
@ -138,7 +138,8 @@ public:
|
||||
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
unsigned AddressSpace);
|
||||
int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
|
||||
bool VariableMask, unsigned Alignment);
|
||||
bool VariableMask, unsigned Alignment,
|
||||
const Instruction *I);
|
||||
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
|
||||
const SCEV *Ptr);
|
||||
|
||||
@ -146,10 +147,11 @@ public:
|
||||
|
||||
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Type *> Tys, FastMathFlags FMF,
|
||||
unsigned ScalarizationCostPassed = UINT_MAX);
|
||||
unsigned ScalarizationCostPassed = UINT_MAX,
|
||||
const Instruction *I = nullptr);
|
||||
int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
|
||||
ArrayRef<Value *> Args, FastMathFlags FMF,
|
||||
unsigned VF = 1);
|
||||
unsigned VF = 1, const Instruction *I = nullptr);
|
||||
|
||||
int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
|
||||
bool IsPairwiseForm);
|
||||
|
@ -3301,7 +3301,7 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
|
||||
FMF = FPMO->getFastMathFlags();
|
||||
|
||||
SmallVector<Value *, 4> Operands(CI->arg_operands());
|
||||
return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
|
||||
return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF, CI);
|
||||
}
|
||||
|
||||
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
|
||||
@ -5889,7 +5889,7 @@ unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
|
||||
return TTI.getAddressComputationCost(VectorTy) +
|
||||
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
|
||||
Legal->isMaskRequired(I),
|
||||
Alignment ? Alignment->value() : 0);
|
||||
Alignment ? Alignment->value() : 0, I);
|
||||
}
|
||||
|
||||
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
|
||||
|
@ -134,21 +134,26 @@ define void @gep_v4i32(i32* %base, i16* %base16, i8* %base8, <4 x i32> %ind32, <
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i32, i32* %base, <4 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep1, i32 4, <4 x i1> %mask, <4 x i32> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res1, <4 x i32*> %gep1, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i32, i32* %base, <4 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep2, i32 4, <4 x i1> %mask, <4 x i32> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res2, <4 x i32*> %gep2, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i32, i32* %base, <4 x i32> %indsext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep3, i32 4, <4 x i1> %mask, <4 x i32> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %res3, <4 x i32*> %gep3, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepu = getelementptr i32, i32* %base, <4 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepu, i32 1, <4 x i1> %mask, <4 x i32> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resu, <4 x i32*> %gepu, i32 1, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x i32*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %geposb, i32 4, <4 x i1> %mask, <4 x i32> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %resos, <4 x i32*> %geposb, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x i32*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gepbsb, i32 4, <4 x i1> %mask, <4 x i32> undef)
|
||||
@ -193,21 +198,26 @@ define void @gep_v4f32(float* %base, i16* %base16, i8* %base8, <4 x i32> %ind32,
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr float, float* %base, <4 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res1 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep1, i32 4, <4 x i1> %mask, <4 x float> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res1, <4 x float*> %gep1, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr float, float* %base, <4 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res2 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep2, i32 4, <4 x i1> %mask, <4 x float> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res2, <4 x float*> %gep2, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr float, float* %base, <4 x i32> %indsext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, float* %base, <4 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x float*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resos, <4 x float*> %geposb, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <4 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <4 x i16*> %gepbs to <4 x float*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %resbs = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepbsb, i32 4, <4 x i1> %mask, <4 x float> undef)
|
||||
@ -252,14 +262,28 @@ define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res1 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep1, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res1, <4 x i16*> %gep1, i32 2, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indzext = zext <4 x i16> %ind16 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <4 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res2 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep2, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res2, <4 x i16*> %gep2, i32 2, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indsext = sext <4 x i16> %ind16 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i16, i16* %base, <4 x i16> %ind16
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <4 x i16> %res5 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <4 x i16> %res6 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
|
||||
;
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
%gep1 = getelementptr i16, i16* %base, <4 x i32> %ind32
|
||||
@ -275,43 +299,109 @@ define void @gep_v4i16(i16* %base, <4 x i32> %ind32, <4 x i16> %ind16, <4 x i1>
|
||||
%gep3 = getelementptr i16, i16* %base, <4 x i32> %indsext
|
||||
%res3 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep3, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res3, <4 x i16*> %gep3, i32 2, <4 x i1> %mask)
|
||||
|
||||
; result zext
|
||||
%gep5 = getelementptr i16, i16* %base, <4 x i16> %ind16
|
||||
%res5 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
%res5zext = zext <4 x i16> %res5 to <4 x i32>
|
||||
%res5trunc = trunc <4 x i32> %res5zext to <4 x i16>
|
||||
call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res5trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
|
||||
|
||||
; result sext
|
||||
%res6 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %gep5, i32 2, <4 x i1> %mask, <4 x i16> undef)
|
||||
%res6sext = sext <4 x i16> %res6 to <4 x i32>
|
||||
%res6trunc = trunc <4 x i32> %res6sext to <4 x i16>
|
||||
call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %res6trunc, <4 x i16*> %gep5, i32 4, <4 x i1> %mask)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask) {
|
||||
define void @gep_v4i8(i8* %base, <4 x i8> %ind8, <4 x i1> %mask) {
|
||||
; CHECK-LABEL: 'gep_v4i8'
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, i8* %base, <4 x i8> %ind8
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res5zext = zext <4 x i8> %res5 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %res6sext = sext <4 x i8> %res6 to <4 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
|
||||
;
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
|
||||
; result zext
|
||||
%gep5 = getelementptr i8, i8* %base, <4 x i8> %ind8
|
||||
%res5 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
|
||||
%res5zext = zext <4 x i8> %res5 to <4 x i32>
|
||||
%res5trunc = trunc <4 x i32> %res5zext to <4 x i8>
|
||||
call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res5trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
|
||||
|
||||
; result sext
|
||||
%res6 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %gep5, i32 2, <4 x i1> %mask, <4 x i8> undef)
|
||||
%res6sext = sext <4 x i8> %res6 to <4 x i32>
|
||||
%res6trunc = trunc <4 x i32> %res6sext to <4 x i8>
|
||||
call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %res6trunc, <4 x i8*> %gep5, i32 4, <4 x i1> %mask)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i8> %ind8, <8 x i1> %mask) {
|
||||
; CHECK-LABEL: 'gep_v8i16'
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res3, <8 x i16*> %gep3, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x i16*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ressext = sext <8 x i16> %res to <8 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %restrunc = trunc <8 x i32> %ressext to <8 x i16>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
; no offset ext
|
||||
%gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32
|
||||
%res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask)
|
||||
|
||||
; offset zext
|
||||
%indzext = zext <8 x i16> %ind16 to <8 x i32>
|
||||
%gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext
|
||||
%res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask)
|
||||
|
||||
; offset sext
|
||||
%indsext = sext <8 x i16> %ind16 to <8 x i32>
|
||||
%gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext
|
||||
%res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
@ -332,6 +422,19 @@ define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <
|
||||
%gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*>
|
||||
%resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask)
|
||||
|
||||
; trunc scatter
|
||||
%indzext4 = zext <8 x i16> %ind16 to <8 x i32>
|
||||
%gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4
|
||||
%indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
|
||||
call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %indtrunc, <8 x i16*> %gep4, i32 2, <8 x i1> %mask)
|
||||
|
||||
; ext result to <8 x i32>
|
||||
%res = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep4, i32 2, <8 x i1> %mask, <8 x i16> undef)
|
||||
%ressext = sext <8 x i16> %res to <8 x i32>
|
||||
%restrunc = trunc <8 x i32> %ressext to <8 x i16>
|
||||
call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %restrunc, <8 x i16*> %gep4, i32 4, <8 x i1> %mask)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -340,35 +443,44 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32,
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr half, half* %base, <8 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, half* %base, <8 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, half* %base, <8 x i32> %indsext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res3, <8 x half*> %gep3, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x half*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
|
||||
; no offset ext
|
||||
%gep1 = getelementptr half, half* %base, <8 x i32> %ind32
|
||||
%res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask)
|
||||
|
||||
; offset zext
|
||||
%indzext = zext <8 x i16> %ind16 to <8 x i32>
|
||||
%gep2 = getelementptr half, half* %base, <8 x i32> %indzext
|
||||
%res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask)
|
||||
|
||||
; offset sext
|
||||
%indsext = sext <8 x i16> %ind16 to <8 x i32>
|
||||
%gep3 = getelementptr half, half* %base, <8 x i32> %indsext
|
||||
%res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
@ -389,6 +501,42 @@ define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32,
|
||||
%gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*>
|
||||
%resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
|
||||
call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @gep_v8i8(i8* %base, <8 x i8> %ind8, <8 x i1> %mask) {
|
||||
; CHECK-LABEL: 'gep_v8i8'
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %indzext = zext <8 x i8> %ind8 to <8 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep5 = getelementptr i8, i8* %base, <8 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res5zext = zext <8 x i8> %res5 to <8 x i16>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res6sext = sext <8 x i8> %res6 to <8 x i16>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
|
||||
; result zext
|
||||
%indzext = zext <8 x i8> %ind8 to <8 x i32>
|
||||
%gep5 = getelementptr i8, i8* %base, <8 x i32> %indzext
|
||||
%res5 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
|
||||
%res5zext = zext <8 x i8> %res5 to <8 x i16>
|
||||
%res5trunc = trunc <8 x i16> %res5zext to <8 x i8>
|
||||
call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res5trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
|
||||
|
||||
; result sext
|
||||
%res6 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %gep5, i32 2, <8 x i1> %mask, <8 x i8> undef)
|
||||
%res6sext = sext <8 x i8> %res6 to <8 x i16>
|
||||
%res6trunc = trunc <8 x i16> %res6sext to <8 x i8>
|
||||
call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %res6trunc, <8 x i8*> %gep5, i32 4, <8 x i1> %mask)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
@ -397,29 +545,40 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res3, <16 x i8*> %gep3, i32 2, <16 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <16 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask)
|
||||
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, i32 2, <16 x i1> %mask)
|
||||
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
; no offset ext
|
||||
%gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32
|
||||
%res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
|
||||
call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask)
|
||||
|
||||
; offset zext
|
||||
%indzext = zext <16 x i8> %ind8 to <16 x i32>
|
||||
%gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext
|
||||
%res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
|
||||
call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask)
|
||||
|
||||
; offset sext
|
||||
%indsext = sext <16 x i8> %ind8 to <16 x i32>
|
||||
%gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext
|
||||
%res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
|
||||
@ -430,6 +589,13 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind
|
||||
%gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*>
|
||||
%resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
|
||||
call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask)
|
||||
|
||||
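; truncating scatter: the stored value is truncated from <16 x i32> to <16 x i8>. NOTE(review): %gep4 below is indexed by %indzext, so %indzext4 appears unused - confirm this is intended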
|
||||
%indzext4 = zext <16 x i8> %ind8 to <16 x i32>
|
||||
%gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext
|
||||
%indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
|
||||
call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %indtrunc, <16 x i8*> %gep4, i32 2, <16 x i1> %mask)
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user