mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 02:33:06 +01:00
[SLP]Fix costs calculations.
Need to fix several cost-related problems. The final type may be defined incorrectly because it is defined too early (we may end up with the wider type); the CommonCost should not be redefined in the ExtractElement cost-related calculations; and the shuffle of the final insertelement vectors should be costed as a single-vector permutation plus two-vector permutations for the other n-1 incoming vectors. Differential Revision: https://reviews.llvm.org/D106578
This commit is contained in:
parent
fc75021aa7
commit
d88801e0e6
@ -3654,7 +3654,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
|
||||
else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
|
||||
ScalarTy = IE->getOperand(1)->getType();
|
||||
auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
|
||||
auto *FinalVecTy = VecTy;
|
||||
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
|
||||
|
||||
// If we have computed a smaller type for the expression, update VecTy so
|
||||
@ -3662,6 +3661,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
|
||||
if (MinBWs.count(VL[0]))
|
||||
VecTy = FixedVectorType::get(
|
||||
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
|
||||
auto *FinalVecTy = VecTy;
|
||||
|
||||
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
|
||||
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
|
||||
@ -3838,7 +3838,6 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
|
||||
case Instruction::ExtractElement: {
|
||||
// The common cost of removal ExtractElement/ExtractValue instructions +
|
||||
// the cost of shuffles, if required to resuffle the original vector.
|
||||
InstructionCost CommonCost = 0;
|
||||
if (NeedToShuffleReuses) {
|
||||
unsigned Idx = 0;
|
||||
for (unsigned I : E->ReuseShuffleIndices) {
|
||||
@ -4133,7 +4132,7 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E,
|
||||
commonAlignment(CommonAlignment, cast<LoadInst>(V)->getAlign());
|
||||
VecLdCost = TTI->getGatherScatterOpCost(
|
||||
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
|
||||
/*VariableMask=*/false, Alignment, CostKind, VL0);
|
||||
/*VariableMask=*/false, CommonAlignment, CostKind, VL0);
|
||||
}
|
||||
LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecLdCost, ScalarLdCost));
|
||||
return CommonCost + VecLdCost - ScalarLdCost;
|
||||
@ -4471,7 +4470,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||
|
||||
SmallPtrSet<Value *, 16> ExtractCostCalculated;
|
||||
InstructionCost ExtractCost = 0;
|
||||
SmallBitVector IsIdentity;
|
||||
SmallVector<unsigned> VF;
|
||||
SmallVector<SmallVector<int>> ShuffleMask;
|
||||
SmallVector<Value *> FirstUsers;
|
||||
@ -4528,15 +4526,12 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||
ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
|
||||
FirstUsers.push_back(EU.User);
|
||||
DemandedElts.push_back(APInt::getNullValue(VF.back()));
|
||||
IsIdentity.push_back(true);
|
||||
VecId = FirstUsers.size() - 1;
|
||||
} else {
|
||||
VecId = std::distance(FirstUsers.begin(), It);
|
||||
}
|
||||
int Idx = *InsertIdx;
|
||||
ShuffleMask[VecId][Idx] = EU.Lane;
|
||||
IsIdentity.set(IsIdentity.test(VecId) &
|
||||
(EU.Lane == Idx || EU.Lane == UndefMaskElem));
|
||||
DemandedElts[VecId].setBit(Idx);
|
||||
}
|
||||
}
|
||||
@ -4562,7 +4557,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||
InstructionCost SpillCost = getSpillCost();
|
||||
Cost += SpillCost + ExtractCost;
|
||||
for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
|
||||
if (!IsIdentity.test(I)) {
|
||||
// For the very first element - simple shuffle of the source vector.
|
||||
if (I == 0 && !ShuffleVectorInst::isIdentityMask(ShuffleMask[I])) {
|
||||
InstructionCost C = TTI->getShuffleCost(
|
||||
TTI::SK_PermuteSingleSrc,
|
||||
cast<FixedVectorType>(FirstUsers[I]->getType()), ShuffleMask[I]);
|
||||
@ -4571,10 +4567,15 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
|
||||
<< *VectorizableTree.front()->Scalars.front() << ".\n"
|
||||
<< "SLP: Current total cost = " << Cost << "\n");
|
||||
Cost += C;
|
||||
continue;
|
||||
}
|
||||
// Other elements - permutation of 2 vectors (the initial one and the next
|
||||
// Ith incoming vector).
|
||||
unsigned VF = ShuffleMask[I].size();
|
||||
for (int &Mask : ShuffleMask[I])
|
||||
Mask = (Mask == UndefMaskElem ? 0 : VF) + Mask;
|
||||
for (unsigned Idx = 0; Idx < VF; ++Idx) {
|
||||
int &Mask = ShuffleMask[I][Idx];
|
||||
Mask = Mask == UndefMaskElem ? Idx : VF + Mask;
|
||||
}
|
||||
InstructionCost C = TTI->getShuffleCost(
|
||||
TTI::SK_PermuteTwoSrc, cast<FixedVectorType>(FirstUsers[I]->getType()),
|
||||
ShuffleMask[I]);
|
||||
|
@ -40,22 +40,22 @@ define void @test(i32* nocapture %t2) {
|
||||
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
|
||||
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
|
||||
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1
|
||||
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> poison, i32 [[TMP6]], i32 0
|
||||
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1
|
||||
; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7
|
||||
; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
|
||||
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
|
||||
; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
|
||||
|
@ -40,22 +40,22 @@ define void @test(i32* nocapture %t2) {
|
||||
; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819
|
||||
; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069
|
||||
; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T40]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T15]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T47]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T9]], i32 1
|
||||
; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]]
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]]
|
||||
; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]]
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0
|
||||
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
|
||||
; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1
|
||||
; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[T691:%.*]] = shufflevector <8 x i32> [[T67]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 5, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T691]], i32 [[T50]], i32 5
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[TMP8]], i32 7
|
||||
; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
|
||||
; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6
|
||||
; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7
|
||||
; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
|
||||
; CHECK-NEXT: [[T79:%.*]] = bitcast i32* [[T2]] to <8 x i32>*
|
||||
; CHECK-NEXT: store <8 x i32> [[T76]], <8 x i32>* [[T79]], align 4
|
||||
|
Loading…
Reference in New Issue
Block a user