
[SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.

If a gather node consists of extractelements that together form a shuffle,
it might be profitable to vectorize the tree even if it is tiny.

Differential Revision: https://reviews.llvm.org/D101460
Alexey Bataev 2021-04-28 07:27:13 -07:00
parent 00169a8661
commit 2a555e9e81
2 changed files with 19 additions and 14 deletions
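
A minimal IR sketch of the pattern this change targets (a hypothetical example, not taken from the commit's tests): a two-node tree whose root is vectorizable and whose second node gathers extractelements that together describe a single shuffle of one source vector.

; Hypothetical input: the gathered scalars %e2 and %e3 both extract
; from %v with constant indices, so the gather node is really the
; shuffle mask <2, 3> applied to %v.
define void @tiny_tree(<4 x i32> %v, i32* %p) {
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  %g0 = getelementptr inbounds i32, i32* %p, i64 0
  %g1 = getelementptr inbounds i32, i32* %p, i64 1
  store i32 %e2, i32* %g0
  store i32 %e3, i32* %g1
  ret void
}

With this change, such a tree can be emitted as a single shufflevector feeding a <2 x i32> store instead of being rejected as an unprofitable tiny tree.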


@@ -4140,13 +4140,18 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
   // Handle splat and all-constants stores. Also try to vectorize tiny trees
   // with the second gather nodes if they have less scalar operands rather than
-  // the initial tree element (may be profitable to shuffle the second gather).
+  // the initial tree element (may be profitable to shuffle the second gather)
+  // or they are extractelements, which form shuffle.
+  SmallVector<int> Mask;
   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
       (allConstant(VectorizableTree[1]->Scalars) ||
        isSplat(VectorizableTree[1]->Scalars) ||
        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
         VectorizableTree[1]->Scalars.size() <
-            VectorizableTree[0]->Scalars.size())))
+            VectorizableTree[0]->Scalars.size()) ||
+       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+        VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
+        isShuffle(VectorizableTree[1]->Scalars, Mask))))
     return true;
   // Gathering cost would be too much for tiny trees.
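
The new clause fires only when every gathered scalar is an extractelement and isShuffle can derive a mask from their constant indices (assuming its usual semantics of matching extracts against one or two source vectors); a hedged sketch of scalars the helper would accept, with illustrative names:

; Both scalars extract from the same source vector with constant
; indices, so together they describe the shuffle mask <3, 1>.
%s0 = extractelement <4 x float> %src, i32 3
%s1 = extractelement <4 x float> %src, i32 1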
@@ -6088,6 +6093,9 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
     break;
   case Instruction::ZExt:
   case Instruction::SExt:
+    if (isa<ExtractElementInst>(I->getOperand(0)) ||
+        isa<InsertElementInst>(I->getOperand(0)))
+      return false;
     break;
   // We can demote certain binary operations if we can demote both of their
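
The ZExt/SExt bail-out above stops collectValuesToDemote from narrowing a cast whose operand is an extract/insertelement. A hedged sketch of the shape that is no longer demoted (names mirror the test below):

; The sext's operand is an extractelement, so collectValuesToDemote
; returns false and the sext keeps its full i64 width; the updated
; CHECK lines below instead sext the whole vector first.
%e = extractelement <4 x i32> %sub, i32 0
%s = sext i32 %e to i64
%gep = getelementptr inbounds i64, i64* %p, i64 %s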


@@ -12,21 +12,18 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, i64* %p) {
 ; CHECK-NEXT: [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT: [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT: [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
-; CHECK-NEXT: [[S0:%.*]] = sext i32 [[E0]] to i64
-; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
+; CHECK-NEXT: [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT: [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
-; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
-; CHECK-NEXT: [[S1:%.*]] = sext i32 [[E1]] to i64
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP2]]
 ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
-; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP3]]
 ; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
-; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP4]]
 ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
 ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT: ret void