mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-18 18:42:46 +02:00
[LoopVectorize] Simplify scalar cost calculation in getInstructionCost
This patch simplifies the calculation of certain costs in getInstructionCost when isScalarAfterVectorization() returns true. There are a few places where we multiply a cost by a number N, i.e.

    unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
    return N * TTI.getArithmeticInstrCost(...

After some investigation it seems that only these cases occur in practice:

1. VF is a scalar, in which case N = 1.
2. VF is a vector. We can only get here if:
   a) the instruction is a GEP/bitcast with scalar uses, or
   b) this is an update to an induction variable that remains scalar.

I have changed the code so that N is assumed to always be 1. For GEPs the cost is always 0, since this is calculated later on as part of the load/store cost. For all other cases I have added an assert that none of the users needs scalarising, which didn't fire in any unit tests.

Only one test required fixing, and I believe the original cost for the scalar add instruction was wrong, since only one copy remains after vectorisation.

Differential Revision: https://reviews.llvm.org/D98512
This commit is contained in:
parent
5e07382161
commit
4e4f3dfb9b
@ -7253,10 +7253,36 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
|
|||||||
Type *RetTy = I->getType();
|
Type *RetTy = I->getType();
|
||||||
if (canTruncateToMinimalBitwidth(I, VF))
|
if (canTruncateToMinimalBitwidth(I, VF))
|
||||||
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
|
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
|
||||||
VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
|
|
||||||
auto SE = PSE.getSE();
|
auto SE = PSE.getSE();
|
||||||
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
||||||
|
|
||||||
|
auto hasSingleCopyAfterVectorization = [this](Instruction *I,
|
||||||
|
ElementCount VF) -> bool {
|
||||||
|
if (VF.isScalar())
|
||||||
|
return true;
|
||||||
|
|
||||||
|
auto Scalarized = InstsToScalarize.find(VF);
|
||||||
|
assert(Scalarized != InstsToScalarize.end() &&
|
||||||
|
"VF not yet analyzed for scalarization profitability");
|
||||||
|
return !Scalarized->second.count(I) &&
|
||||||
|
llvm::all_of(I->users(), [&](User *U) {
|
||||||
|
auto *UI = cast<Instruction>(U);
|
||||||
|
return !Scalarized->second.count(UI);
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
if (isScalarAfterVectorization(I, VF)) {
|
||||||
|
VectorTy = RetTy;
|
||||||
|
// With the exception of GEPs, after scalarization there should only be one
|
||||||
|
// copy of the instruction generated in the loop. This is because the VF is
|
||||||
|
// either 1, or any instructions that need scalarizing have already been
|
||||||
|
// dealt with by the time we get here. As a result, it means we don't
|
||||||
|
// have to multiply the instruction cost by VF.
|
||||||
|
assert(I->getOpcode() == Instruction::GetElementPtr ||
|
||||||
|
hasSingleCopyAfterVectorization(I, VF));
|
||||||
|
} else
|
||||||
|
VectorTy = ToVectorTy(RetTy, VF);
|
||||||
|
|
||||||
// TODO: We need to estimate the cost of intrinsic calls.
|
// TODO: We need to estimate the cost of intrinsic calls.
|
||||||
switch (I->getOpcode()) {
|
switch (I->getOpcode()) {
|
||||||
case Instruction::GetElementPtr:
|
case Instruction::GetElementPtr:
|
||||||
@ -7384,21 +7410,16 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
|
|||||||
Op2VK = TargetTransformInfo::OK_UniformValue;
|
Op2VK = TargetTransformInfo::OK_UniformValue;
|
||||||
|
|
||||||
SmallVector<const Value *, 4> Operands(I->operand_values());
|
SmallVector<const Value *, 4> Operands(I->operand_values());
|
||||||
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
|
return TTI.getArithmeticInstrCost(
|
||||||
return N * TTI.getArithmeticInstrCost(
|
I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
|
||||||
I->getOpcode(), VectorTy, CostKind,
|
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
|
||||||
TargetTransformInfo::OK_AnyValue,
|
|
||||||
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
|
|
||||||
}
|
}
|
||||||
case Instruction::FNeg: {
|
case Instruction::FNeg: {
|
||||||
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
|
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
|
||||||
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
|
return TTI.getArithmeticInstrCost(
|
||||||
return N * TTI.getArithmeticInstrCost(
|
I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
|
||||||
I->getOpcode(), VectorTy, CostKind,
|
TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
|
||||||
TargetTransformInfo::OK_AnyValue,
|
TargetTransformInfo::OP_None, I->getOperand(0), I);
|
||||||
TargetTransformInfo::OK_AnyValue,
|
|
||||||
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
|
|
||||||
I->getOperand(0), I);
|
|
||||||
}
|
}
|
||||||
case Instruction::Select: {
|
case Instruction::Select: {
|
||||||
SelectInst *SI = cast<SelectInst>(I);
|
SelectInst *SI = cast<SelectInst>(I);
|
||||||
@ -7522,14 +7543,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned N;
|
return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
|
||||||
if (isScalarAfterVectorization(I, VF)) {
|
|
||||||
assert(!VF.isScalable() && "VF is assumed to be non scalable");
|
|
||||||
N = VF.getKnownMinValue();
|
|
||||||
} else
|
|
||||||
N = 1;
|
|
||||||
return N *
|
|
||||||
TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
|
|
||||||
}
|
}
|
||||||
case Instruction::Call: {
|
case Instruction::Call: {
|
||||||
bool NeedToScalarize;
|
bool NeedToScalarize;
|
||||||
@ -7544,11 +7558,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
|
|||||||
case Instruction::ExtractValue:
|
case Instruction::ExtractValue:
|
||||||
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
|
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
|
||||||
default:
|
default:
|
||||||
// The cost of executing VF copies of the scalar instruction. This opcode
|
// This opcode is unknown. Assume that it is the same as 'mul'.
|
||||||
// is unknown. Assume that it is the same as 'mul'.
|
return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
|
||||||
return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
|
|
||||||
Instruction::Mul, VectorTy, CostKind) +
|
|
||||||
getScalarizationOverhead(I, VF);
|
|
||||||
} // end of switch.
|
} // end of switch.
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ target triple = "aarch64--linux-gnu"
|
|||||||
|
|
||||||
; CHECK-LABEL: all_scalar
|
; CHECK-LABEL: all_scalar
|
||||||
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
|
; CHECK: LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
|
||||||
; CHECK: LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
|
; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
|
||||||
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
|
; CHECK: LV: Not considering vector loop of width 2 because it will not generate any vector instructions
|
||||||
;
|
;
|
||||||
define void @all_scalar(i64* %a, i64 %n) {
|
define void @all_scalar(i64* %a, i64 %n) {
|
||||||
|
Loading…
Reference in New Issue
Block a user