Mirror of https://github.com/RPCS3/llvm-mirror.git
[Vectorizer] Add a new 'OperandValueKind' in TargetTransformInfo called 'OK_NonUniformConstantValue' to identify operands that are constants but not constant splats.

The cost model now allows returning 'OK_NonUniformConstantValue' for non-splat operands that are instances of ConstantVector or ConstantDataVector. With this change, targets are able to compute different costs for instructions with non-uniform constant operands. For example, on X86 the cost of a vector shift may vary depending on whether the second operand is a uniform or non-uniform constant.

This patch applies the following changes:
- The cost model computation now takes non-uniform constants into account;
- The cost of vector shift instructions has been improved in the X86TargetTransformInfo analysis pass;
- BBVectorize, SLPVectorizer and LoopVectorize now know how to distinguish between non-uniform and uniform constant operands.

Added a new test to verify that the output of opt '-cost-model -analyze' is valid in the following configurations: SSE2, SSE4.1, AVX, AVX2.

llvm-svn: 201272
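Loosely, what the new kind enables: a target cost hook can now tell a constant shift amount (splat or not) apart from a runtime one and price them differently. The sketch below is illustrative only; OperandKind mirrors OperandValueKind, but getVectorShiftCost and its cost values are made-up stand-ins, not the X86 numbers added by this patch.

#include <cassert>

// Hypothetical mirror of TargetTransformInfo::OperandValueKind.
enum class OperandKind { AnyValue, UniformValue, UniformConstant, NonUniformConstant };

// Made-up stand-in for a target cost hook such as getArithmeticInstrCost.
// With the new non-uniform-constant kind, a shift whose amount is any
// constant build_vector can be priced like a single vector multiply, while
// a shift by a runtime value may have to be scalarized.
unsigned getVectorShiftCost(OperandKind ShiftAmountKind) {
  if (ShiftAmountKind == OperandKind::UniformConstant ||
      ShiftAmountKind == OperandKind::NonUniformConstant)
    return 1;   // illustrative: lowered to a pmullw/pmulld-style multiply
  return 10;    // illustrative: scalarized without variable vector shifts
}

int main() {
  // Constant shift amounts (uniform or not) get the cheap cost.
  assert(getVectorShiftCost(OperandKind::NonUniformConstant) ==
         getVectorShiftCost(OperandKind::UniformConstant));
  assert(getVectorShiftCost(OperandKind::AnyValue) >
         getVectorShiftCost(OperandKind::UniformConstant));
  return 0;
}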
Commit 594ea331ef (parent b682c0a265)
@@ -323,7 +323,8 @@ public:
   enum OperandValueKind {
     OK_AnyValue,                 // Operand can have any value.
     OK_UniformValue,             // Operand is uniform (splat of a value).
-    OK_UniformConstantValue      // Operand is uniform constant.
+    OK_UniformConstantValue,     // Operand is uniform constant.
+    OK_NonUniformConstantValue   // Operand is a non uniform constant value.
   };

   /// \return The number of scalar or vector registers that the target has.
@@ -98,15 +98,20 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
   TargetTransformInfo::OperandValueKind OpInfo =
     TargetTransformInfo::OK_AnyValue;

-  // Check for a splat of a constant.
+  // Check for a splat of a constant or for a non uniform vector of constants.
   ConstantDataVector *CDV = 0;
-  if ((CDV = dyn_cast<ConstantDataVector>(V)))
+  if ((CDV = dyn_cast<ConstantDataVector>(V))) {
+    OpInfo = TargetTransformInfo::OK_NonUniformConstantValue;
     if (CDV->getSplatValue() != NULL)
       OpInfo = TargetTransformInfo::OK_UniformConstantValue;
+  }

   ConstantVector *CV = 0;
-  if ((CV = dyn_cast<ConstantVector>(V)))
+  if ((CV = dyn_cast<ConstantVector>(V))) {
+    OpInfo = TargetTransformInfo::OK_NonUniformConstantValue;
     if (CV->getSplatValue() != NULL)
       OpInfo = TargetTransformInfo::OK_UniformConstantValue;
+  }

   return OpInfo;
 }
@@ -225,6 +225,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,

   // Look for AVX2 lowering tricks.
   if (ST->hasAVX2()) {
+    if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
+        (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+         Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+      // On AVX2, a packed v16i16 shift left by a constant build_vector
+      // is lowered into a vector multiply (vpmullw).
+      return LT.first;
+
     int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * AVX2CostTable[Idx].Cost;
@@ -257,6 +264,20 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     return LT.first * SSE2UniformConstCostTable[Idx].Cost;
   }

+  if (ISD == ISD::SHL &&
+      Op2Info == TargetTransformInfo::OK_NonUniformConstantValue) {
+    EVT VT = LT.second;
+    if ((VT == MVT::v8i16 && ST->hasSSE2()) ||
+        (VT == MVT::v4i32 && ST->hasSSE41()))
+      // Vector shift left by non uniform constant can be lowered
+      // into vector multiply (pmullw/pmulld).
+      return LT.first;
+    if (VT == MVT::v4i32 && ST->hasSSE2())
+      // A vector shift left by non uniform constant is converted
+      // into a vector multiply; the new multiply is eventually
+      // lowered into a sequence of shuffles and 2 x pmuludq.
+      ISD = ISD::MUL;
+  }
+
   static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
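The special cases above rely on the identity x << c == x * (1 << c) for each lane, which is why a shift left by a constant build_vector can be costed like a vector multiply. A minimal standalone sketch of that equivalence (plain C++, not LLVM code; the function name is made up for illustration):

#include <cassert>
#include <cstdint>
#include <vector>

// For a per-lane constant shift amount C[i], X[i] << C[i] equals
// X[i] * (1 << C[i]); this is the rewrite that lets a v8i16/v4i32 shift by
// a constant vector be lowered to pmullw/pmulld and priced as one multiply.
std::vector<uint32_t> shiftViaMultiply(const std::vector<uint32_t> &X,
                                       const std::vector<uint32_t> &C) {
  std::vector<uint32_t> R(X.size());
  for (size_t i = 0; i < X.size(); ++i)
    R[i] = X[i] * (uint32_t(1) << C[i]); // same result as X[i] << C[i]
  return R;
}

int main() {
  std::vector<uint32_t> X = {1, 2, 3, 4}, C = {1, 1, 2, 3};
  std::vector<uint32_t> R = shiftViaMultiply(X, C);
  for (size_t i = 0; i < X.size(); ++i)
    assert(R[i] == (X[i] << C[i]));
  return 0;
}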
@@ -271,6 +292,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SHL,  MVT::v8i16,  8*10 },  // Scalarized.
     { ISD::SHL,  MVT::v4i32,  2*5 },   // We optimized this using mul.
     { ISD::SHL,  MVT::v2i64,  2*10 },  // Scalarized.
+    { ISD::SHL,  MVT::v4i64,  4*10 },  // Scalarized.

     { ISD::SRL,  MVT::v16i8,  16*10 }, // Scalarized.
     { ISD::SRL,  MVT::v8i16,  8*10 },  // Scalarized.
@@ -308,6 +330,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     // We don't have to scalarize unsupported ops. We can issue two half-sized
     // operations and we only need to extract the upper YMM half.
     // Two ops + 1 extract + 1 insert = 4.
+    { ISD::MUL,     MVT::v16i16,   4 },
     { ISD::MUL,     MVT::v8i32,    4 },
     { ISD::SUB,     MVT::v8i32,    4 },
     { ISD::ADD,     MVT::v8i32,    4 },
@@ -323,7 +346,15 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,

   // Look for AVX1 lowering tricks.
   if (ST->hasAVX() && !ST->hasAVX2()) {
-    int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
+    EVT VT = LT.second;
+
+    // v16i16 and v8i32 shifts by non-uniform constants are lowered into a
+    // sequence of extract + two vector multiply + insert.
+    if (ISD == ISD::SHL && (VT == MVT::v8i32 || VT == MVT::v16i16) &&
+        Op2Info == TargetTransformInfo::OK_NonUniformConstantValue)
+      ISD = ISD::MUL;
+
+    int Idx = CostTableLookup(AVX1CostTable, ISD, VT);
     if (Idx != -1)
       return LT.first * AVX1CostTable[Idx].Cost;
   }
@@ -343,7 +374,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   // 2x pmuludq, 2x shuffle.
   if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
       !ST->hasSSE41())
-    return 6;
+    return LT.first * 6;

   // Fallback to the default implementation.
   return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
@@ -532,7 +532,11 @@ namespace {

    // Returns the cost of the provided instruction using TTI.
    // This does not handle loads and stores.
-    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
+                          TargetTransformInfo::OperandValueKind Op1VK =
+                              TargetTransformInfo::OK_AnyValue,
+                          TargetTransformInfo::OperandValueKind Op2VK =
+                              TargetTransformInfo::OK_AnyValue) {
      switch (Opcode) {
      default: break;
      case Instruction::GetElementPtr:
@@ -562,7 +566,7 @@ namespace {
      case Instruction::And:
      case Instruction::Or:
      case Instruction::Xor:
-        return TTI->getArithmeticInstrCost(Opcode, T1);
+        return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
      case Instruction::Select:
      case Instruction::ICmp:
      case Instruction::FCmp:
@@ -1013,13 +1017,58 @@ namespace {
      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
      Type *VT1 = getVecTypeForPair(IT1, JT1),
           *VT2 = getVecTypeForPair(IT2, JT2);
+      TargetTransformInfo::OperandValueKind Op1VK =
+          TargetTransformInfo::OK_AnyValue;
+      TargetTransformInfo::OperandValueKind Op2VK =
+          TargetTransformInfo::OK_AnyValue;
+
+      // On some targets (example X86) the cost of a vector shift may vary
+      // depending on whether the second operand is a Uniform or
+      // NonUniform Constant.
+      switch (I->getOpcode()) {
+      default : break;
+      case Instruction::Shl:
+      case Instruction::LShr:
+      case Instruction::AShr:
+
+        // If both I and J are scalar shifts by constant, then the
+        // merged vector shift count would be either a constant splat value
+        // or a non-uniform vector of constants.
+        if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
+          if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
+            Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
+                                 TargetTransformInfo::OK_NonUniformConstantValue;
+        } else {
+          // Check for a splat of a constant or for a non uniform vector
+          // of constants.
+          Value *IOp = I->getOperand(1);
+          Value *JOp = J->getOperand(1);
+          if (ConstantDataVector *CDVI = dyn_cast<ConstantDataVector>(IOp)) {
+            if (ConstantDataVector *CDVJ = dyn_cast<ConstantDataVector>(JOp)) {
+              Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+              Constant *SplatValue = CDVI->getSplatValue();
+              if (SplatValue != NULL && SplatValue == CDVJ->getSplatValue())
+                Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+            }
+          }
+
+          if (ConstantVector *CVI = dyn_cast<ConstantVector>(IOp)) {
+            if (ConstantVector *CVJ = dyn_cast<ConstantVector>(JOp)) {
+              Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+              Constant *SplatValue = CVI->getSplatValue();
+              if (SplatValue != NULL && SplatValue == CVJ->getSplatValue())
+                Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+            }
+          }
+        }
+      }
+
      // Note that this procedure is incorrect for insert and extract element
      // instructions (because combining these often results in a shuffle),
      // but this cost is ignored (because insert and extract element
      // instructions are assigned a zero depth factor and are not really
      // fused in general).
-      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);

      if (VCost > ICost + JCost)
        return false;
@@ -5491,9 +5491,20 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
      TargetTransformInfo::OK_AnyValue;
    TargetTransformInfo::OperandValueKind Op2VK =
      TargetTransformInfo::OK_AnyValue;
+    Value *Op2 = I->getOperand(1);

-    if (isa<ConstantInt>(I->getOperand(1)))
+    // Check for a splat of a constant or for a non uniform vector of constants.
+    if (isa<ConstantInt>(Op2))
      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    else if (ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(Op2)) {
+      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      if (CDV->getSplatValue() != NULL)
+        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    } else if (ConstantVector *CV = dyn_cast<ConstantVector>(Op2)) {
+      Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      if (CV->getSplatValue() != NULL)
+        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+    }

    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
  }
@@ -1044,12 +1044,26 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
      TargetTransformInfo::OperandValueKind Op2VK =
        TargetTransformInfo::OK_UniformConstantValue;

-      // Check whether all second operands are constant.
-      for (unsigned i = 0; i < VL.size(); ++i)
-        if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) {
+      // If all operands are exactly the same ConstantInt then set the
+      // operand kind to OK_UniformConstantValue.
+      // If instead not all operands are constants, then set the operand kind
+      // to OK_AnyValue. If all operands are constants but not the same,
+      // then set the operand kind to OK_NonUniformConstantValue.
+      ConstantInt *CInt = NULL;
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        const Instruction *I = cast<Instruction>(VL[i]);
+        if (!isa<ConstantInt>(I->getOperand(1))) {
          Op2VK = TargetTransformInfo::OK_AnyValue;
          break;
        }
+        if (i == 0) {
+          CInt = cast<ConstantInt>(I->getOperand(1));
+          continue;
+        }
+        if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+            CInt != cast<ConstantInt>(I->getOperand(1)))
+          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+      }

      ScalarCost =
        VecTy->getNumElements() *
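The SLPVectorizer hunk above classifies the whole bundle of second operands rather than a single value. A standalone sketch of that decision rule over a plain list of optional shift amounts (illustrative names only, not the LLVM API; std::nullopt stands in for "not a ConstantInt"):

#include <cassert>
#include <optional>
#include <vector>

// Hypothetical mirror of the three operand kinds used in that loop.
enum class OperandKind { AnyValue, UniformConstant, NonUniformConstant };

// Start optimistic (uniform constant), downgrade to NonUniformConstant when
// two constants differ, and fall back to AnyValue as soon as one operand is
// not a constant at all.
OperandKind classifyBundle(const std::vector<std::optional<int>> &Ops) {
  OperandKind Kind = OperandKind::UniformConstant;
  int First = 0;
  for (size_t i = 0; i < Ops.size(); ++i) {
    if (!Ops[i])
      return OperandKind::AnyValue;
    if (i == 0) {
      First = *Ops[i];
      continue;
    }
    if (Kind == OperandKind::UniformConstant && *Ops[i] != First)
      Kind = OperandKind::NonUniformConstant;
  }
  return Kind;
}

int main() {
  assert(classifyBundle({1, 1, 1, 1}) == OperandKind::UniformConstant);
  assert(classifyBundle({1, 2, 3, 4}) == OperandKind::NonUniformConstant);
  assert(classifyBundle({1, std::nullopt, 3, 4}) == OperandKind::AnyValue);
  return 0;
}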
test/Analysis/CostModel/X86/vshift-cost.ll (new file, 167 lines)
@@ -0,0 +1,167 @@
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2


; Verify the cost of vector shift left instructions.

; We always emit a single pmullw in the case of v8i16 vector shifts by
; non-uniform constant.

define <8 x i16> @test1(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test1':
; CHECK: Found an estimated cost of 1 for instruction: %shl


define <8 x i16> @test2(<8 x i16> %a) {
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test2':
; CHECK: Found an estimated cost of 1 for instruction: %shl


; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
; Make sure that the estimated cost is always 1 except for the case where
; we only have SSE2 support. With SSE2, we are forced to special lower the
; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle.

define <4 x i32> @test3(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test3':
; SSE2: Found an estimated cost of 6 for instruction: %shl
; SSE41: Found an estimated cost of 1 for instruction: %shl
; AVX: Found an estimated cost of 1 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl


define <4 x i32> @test4(<4 x i32> %a) {
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test4':
; SSE2: Found an estimated cost of 6 for instruction: %shl
; SSE41: Found an estimated cost of 1 for instruction: %shl
; AVX: Found an estimated cost of 1 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl


; On AVX2 we are able to lower the following shift into a single
; vpsllvq. Therefore, the expected cost is only 1.
; In all other cases, this shift is scalarized as the target does not support
; vpsllv instructions.

define <2 x i64> @test5(<2 x i64> %a) {
  %shl = shl <2 x i64> %a, <i64 2, i64 3>
  ret <2 x i64> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test5':
; SSE2: Found an estimated cost of 20 for instruction: %shl
; SSE41: Found an estimated cost of 20 for instruction: %shl
; AVX: Found an estimated cost of 20 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl


; v16i16 and v8i32 shift left by non-uniform constant are lowered into
; vector multiply instructions. With AVX (but not AVX2), the vector multiply
; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
;
; With AVX2, instruction vpmullw works with 256bit quantities and
; therefore there is no need to split the resulting vector multiply into
; a sequence of two multiply.
;
; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
; the cost computed in the case of 'test1'. That is because the backend
; simply emits 2 pmullw with no extract/insert.


define <16 x i16> @test6(<16 x i16> %a) {
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test6':
; SSE2: Found an estimated cost of 2 for instruction: %shl
; SSE41: Found an estimated cost of 2 for instruction: %shl
; AVX: Found an estimated cost of 4 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl


; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
; the cost computed in the case of 'test3'. That is because the multiply
; is type-legalized into two 4i32 vector multiply.

define <8 x i32> @test7(<8 x i32> %a) {
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test7':
; SSE2: Found an estimated cost of 12 for instruction: %shl
; SSE41: Found an estimated cost of 2 for instruction: %shl
; AVX: Found an estimated cost of 4 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl


; On AVX2 we are able to lower the following shift into a single
; vpsllvq. Therefore, the expected cost is only 1.
; In all other cases, this shift is scalarized as the target does not support
; vpsllv instructions.

define <4 x i64> @test8(<4 x i64> %a) {
  %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
  ret <4 x i64> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test8':
; SSE2: Found an estimated cost of 40 for instruction: %shl
; SSE41: Found an estimated cost of 40 for instruction: %shl
; AVX: Found an estimated cost of 40 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl


; Same as 'test6', with the difference that the cost is double.

define <32 x i16> @test9(<32 x i16> %a) {
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test9':
; SSE2: Found an estimated cost of 4 for instruction: %shl
; SSE41: Found an estimated cost of 4 for instruction: %shl
; AVX: Found an estimated cost of 8 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl


; Same as 'test7', except that now the cost is double.

define <16 x i32> @test10(<16 x i32> %a) {
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test10':
; SSE2: Found an estimated cost of 24 for instruction: %shl
; SSE41: Found an estimated cost of 4 for instruction: %shl
; AVX: Found an estimated cost of 8 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl


; On AVX2 we are able to lower the following shift into a sequence of
; two vpsllvq instructions. Therefore, the expected cost is only 2.
; In all other cases, this shift is scalarized as we don't have vpsllv
; instructions.

define <8 x i64> @test11(<8 x i64> %a) {
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}
; CHECK: 'Cost Model Analysis' for function 'test11':
; SSE2: Found an estimated cost of 80 for instruction: %shl
; SSE41: Found an estimated cost of 80 for instruction: %shl
; AVX: Found an estimated cost of 80 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl