1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 19:12:56 +02:00

[SLP] Fix vectorization for tree with trunc to minimum required bit width.

Summary:
If the vectorized tree has truncate to minimum required bit width and
the vector type of the cast operation after the truncation is the same
as the vector type of the cast operands, count cost of the vector cast
operation as 0, because this cast will be later removed.
Also, if the vectorization tree root operations are integer cast operations, do not consider them as candidates for truncation. It will just create extra number of the same vector/scalar operations, which will be removed by instcombiner.

Reviewers: RKSimon, spatel, mkuper, hfinkel, mssimpso

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D41948

llvm-svn: 322946
This commit is contained in:
Alexey Bataev 2018-01-19 14:40:13 +00:00
parent 90b39019b6
commit 370d1329a7
3 changed files with 33 additions and 21 deletions

View File

@ -2065,7 +2065,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
VL0->getType(), SrcTy, VL0);
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
int VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy)
VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
return VecCost - ScalarCost;
}
case Instruction::FCmp:
@ -4014,9 +4017,24 @@ void BoUpSLP::computeMinimumValueSizes() {
// additional roots that require investigating in Roots.
SmallVector<Value *, 32> ToDemote;
SmallVector<Value *, 4> Roots;
for (auto *Root : TreeRoot)
for (auto *Root : TreeRoot) {
// Do not include top zext/sext/trunc operations to those to be demoted, it
// produces noise cast<vect>, trunc <vect>, exctract <vect>, cast <extract>
// sequence.
if (isa<Constant>(Root))
continue;
auto *I = dyn_cast<Instruction>(Root);
if (!I || !I->hasOneUse() || !Expr.count(I))
return;
if (isa<ZExtInst>(I) || isa<SExtInst>(I))
continue;
if (auto *TI = dyn_cast<TruncInst>(I)) {
Roots.push_back(TI->getOperand(0));
continue;
}
if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
return;
}
// The maximum bit width required to represent all the values that can be
// demoted without loss of precision. It would be safe to truncate the roots

View File

@ -16,13 +16,10 @@ define { i64, i64 } @patatino(double %arg) {
; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
; CHECK-NEXT: [[TMP8:%.*]] = fptosi <2 x double> [[TMP7]] to <2 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = sext <2 x i32> [[TMP8]] to <2 x i64>
; CHECK-NEXT: [[TMP10:%.*]] = trunc <2 x i64> [[TMP9]] to <2 x i32>
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP12]], 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP14]], 1
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = insertvalue { i64, i64 } undef, i64 [[TMP10]], 0
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1
; CHECK-NEXT: [[TMP17:%.*]] = insertvalue { i64, i64 } [[TMP16]], i64 [[TMP11]], 1
; CHECK-NEXT: ret { i64, i64 } [[TMP17]]
;
bb:

View File

@ -4,18 +4,15 @@
define <4 x i32> @sign_extend_v_v(<4 x i16> %lhs) {
; CHECK-LABEL: @sign_extend_v_v(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x i16> [[LHS:%.*]], i32 0
; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[VECEXT]] to i32
; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[CONV]], i32 0
; CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i16> [[LHS]], i32 1
; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[VECEXT1]] to i32
; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[CONV2]], i32 1
; CHECK-NEXT: [[VECEXT4:%.*]] = extractelement <4 x i16> [[LHS]], i32 2
; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[VECEXT4]] to i32
; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[CONV5]], i32 2
; CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <4 x i16> [[LHS]], i32 3
; CHECK-NEXT: [[CONV8:%.*]] = sext i16 [[VECEXT7]] to i32
; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[CONV8]], i32 3
; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i16> [[LHS:%.*]] to <4 x i32>
; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 0
; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP0]], i32 1
; CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[TMP2]], i32 1
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
; CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT3]], i32 [[TMP3]], i32 2
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
; CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <4 x i32> [[VECINIT6]], i32 [[TMP4]], i32 3
; CHECK-NEXT: ret <4 x i32> [[VECINIT9]]
;
entry: