From 2a555e9e812157eca68e9e97db61dad8a6502466 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 28 Apr 2021 07:27:13 -0700
Subject: [PATCH] [SLP]Try to vectorize tiny trees with shuffled gathers of extractelements.

If we gather extractelements and they actually just form a shuffle, it
may be profitable to vectorize them even if the tree is tiny.

Differential Revision: https://reviews.llvm.org/D101460
---
 lib/Transforms/Vectorize/SLPVectorizer.cpp     | 12 +++++++++--
 .../SLPVectorizer/AArch64/ext-trunc.ll         | 21 ++++++++-----------
 2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e165c424bab..06276869687 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4140,13 +4140,18 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
 
   // Handle splat and all-constants stores. Also try to vectorize tiny trees
   // with the second gather nodes if they have less scalar operands rather than
-  // the initial tree element (may be profitable to shuffle the second gather).
+  // the initial tree element (may be profitable to shuffle the second gather)
+  // or they are extractelements, which form a shuffle.
+  SmallVector<int> Mask;
   if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
       (allConstant(VectorizableTree[1]->Scalars) ||
        isSplat(VectorizableTree[1]->Scalars) ||
        (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
         VectorizableTree[1]->Scalars.size() <
-            VectorizableTree[0]->Scalars.size())))
+            VectorizableTree[0]->Scalars.size()) ||
+       (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
+        VectorizableTree[1]->getOpcode() == Instruction::ExtractElement &&
+        isShuffle(VectorizableTree[1]->Scalars, Mask))))
     return true;
 
   // Gathering cost would be too much for tiny trees.
@@ -6088,6 +6093,9 @@ static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
     break;
   case Instruction::ZExt:
   case Instruction::SExt:
+    if (isa<ExtractElementInst>(I->getOperand(0)) ||
+        isa<InsertElementInst>(I->getOperand(0)))
+      return false;
     break;
 
   // We can demote certain binary operations if we can demote both of their
diff --git a/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
index 16e7549a6a2..e402bd8bf54 100644
--- a/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@@ -12,21 +12,18 @@ define void @test1(<4 x i16> %a, <4 x i16> %b, i64* %p) {
 ; CHECK-NEXT:    [[Z0:%.*]] = zext <4 x i16> [[A:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[Z1:%.*]] = zext <4 x i16> [[B:%.*]] to <4 x i32>
 ; CHECK-NEXT:    [[SUB0:%.*]] = sub <4 x i32> [[Z0]], [[Z1]]
-; CHECK-NEXT:    [[E0:%.*]] = extractelement <4 x i32> [[SUB0]], i32 0
-; CHECK-NEXT:    [[S0:%.*]] = sext i32 [[E0]] to i64
-; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[S0]]
+; CHECK-NEXT:    [[TMP0:%.*]] = sext <4 x i32> [[SUB0]] to <4 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0
+; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP1]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]], align 4
-; CHECK-NEXT:    [[E1:%.*]] = extractelement <4 x i32> [[SUB0]], i32 1
-; CHECK-NEXT:    [[S1:%.*]] = sext i32 [[E1]] to i64
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[LOAD1:%.*]] = load i64, i64* [[GEP1]], align 4
-; CHECK-NEXT:    [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2
-; CHECK-NEXT:    [[S2:%.*]] = sext i32 [[E2]] to i64
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x i64> [[TMP0]], i32 2
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP3]]
 ; CHECK-NEXT:    [[LOAD2:%.*]] = load i64, i64* [[GEP2]], align 4
-; CHECK-NEXT:    [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3
-; CHECK-NEXT:    [[S3:%.*]] = sext i32 [[E3]] to i64
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[S3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x i64> [[TMP0]], i32 3
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[LOAD3:%.*]] = load i64, i64* [[GEP3]], align 4
 ; CHECK-NEXT:    call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]])
 ; CHECK-NEXT:    ret void
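Illustrative IR sketch (not part of the patch): the function and value names
below are made up, and it simply restates the pattern @test1 exercises. The
gather node feeding the sexts consists of extractelements from one vector at
indices 0..3, i.e. it is really an (identity) shuffle of %v, so the tiny
two-node tree {vectorizable sext, gathered extractelements} is now accepted
by isFullyVectorizableTinyTree().

declare void @use(i64, i64, i64, i64)

define void @shuffled_gather_of_extracts(<4 x i32> %v, i64* %p) {
entry:
  ; Gathered scalars: extractelements of a single vector whose indices form
  ; a (here, identity) shuffle mask.
  %e0 = extractelement <4 x i32> %v, i32 0
  %e1 = extractelement <4 x i32> %v, i32 1
  %e2 = extractelement <4 x i32> %v, i32 2
  %e3 = extractelement <4 x i32> %v, i32 3
  ; Vectorizable node: with this change the four sexts are expected to become
  ; a single "sext <4 x i32> to <4 x i64>" plus extracts, as in the updated
  ; CHECK lines above.
  %s0 = sext i32 %e0 to i64
  %s1 = sext i32 %e1 to i64
  %s2 = sext i32 %e2 to i64
  %s3 = sext i32 %e3 to i64
  ; Use as GEP indices (mirroring @test1) gives SLP a seed bundle for the tree.
  %g0 = getelementptr inbounds i64, i64* %p, i64 %s0
  %g1 = getelementptr inbounds i64, i64* %p, i64 %s1
  %g2 = getelementptr inbounds i64, i64* %p, i64 %s2
  %g3 = getelementptr inbounds i64, i64* %p, i64 %s3
  %l0 = load i64, i64* %g0, align 8
  %l1 = load i64, i64* %g1, align 8
  %l2 = load i64, i64* %g2, align 8
  %l3 = load i64, i64* %g3, align 8
  call void @use(i64 %l0, i64 %l1, i64 %l2, i64 %l3)
  ret void
}

Previously this two-node tree was rejected because its second node needs
gathering; since the gathered extractelements can be lowered as a shuffle (or
the source vector reused directly), it is now considered profitable even
though the tree is tiny.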