1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 02:33:06 +01:00

[LV] Return both fixed and scalable Max VF from computeMaxVF.

This patch introduces a new class, MaxVFCandidates, that holds the
maximum vectorization factors that have been computed for both scalable
and fixed-width vectors.

This patch is intended to be NFC for fixed-width vectors, although
considering a scalable max VF (which is disabled by default) pessimises
tail-loop elimination, since it can no longer determine if any chosen VF
(less than fixed/scalable MaxVFs) is guaranteed to handle all vector
iterations if the trip-count is known. This issue will be addressed in
a future patch.

Reviewed By: fhahn, david-arm

Differential Revision: https://reviews.llvm.org/D98721
This commit is contained in:
Sander de Smalen 2021-05-18 07:37:31 +01:00
parent 9ff115e8b2
commit 66261e9287
3 changed files with 129 additions and 62 deletions

View File

@ -200,6 +200,37 @@ struct VectorizationFactor {
}
};
/// A class that represents two vectorization factors (initialized with 0 by
/// default). One for fixed-width vectorization and one for scalable
/// vectorization. This can be used by the vectorizer to choose from a range of
/// fixed and/or scalable VFs in order to find the most cost-effective VF to
/// vectorize with.
struct FixedScalableVFPair {
ElementCount FixedVF;
ElementCount ScalableVF;
FixedScalableVFPair()
: FixedVF(ElementCount::getFixed(0)),
ScalableVF(ElementCount::getScalable(0)) {}
FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
*(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
}
FixedScalableVFPair(const ElementCount &FixedVF,
const ElementCount &ScalableVF)
: FixedVF(FixedVF), ScalableVF(ScalableVF) {
assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
"Invalid scalable properties");
}
static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
/// \return true if either fixed- or scalable VF is non-zero.
explicit operator bool() const { return FixedVF || ScalableVF; }
/// \return true if either fixed- or scalable VF is a valid vector VF.
bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
};
/// Planner drives the vectorization process after having passed
/// Legality checks.
class LoopVectorizationPlanner {

View File

@ -1229,9 +1229,10 @@ public:
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI) {}
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
/// avoided up front.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
@ -1625,11 +1626,13 @@ public:
private:
unsigned NumPredStores = 0;
/// \return An upper bound for the vectorization factor, a power-of-2 larger
/// than zero. One is returned if vectorization should best be avoided due
/// to cost.
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF);
/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF);
/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
@ -5676,7 +5679,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
return MaxScalableVF;
}
ElementCount
FixedScalableVFPair
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
@ -5742,22 +5745,24 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
<< " / " << WidestType << " bits.\n");
ElementCount MaxFixedVF = ElementCount::getFixed(1);
FixedScalableVFPair Result(ElementCount::getFixed(1),
ElementCount::getScalable(0));
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
WidestType, MaxSafeFixedVF))
MaxFixedVF = MaxVF;
Result.FixedVF = MaxVF;
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
WidestType, MaxSafeScalableVF))
// FIXME: Return scalable VF as well (to be added in future patch).
if (MaxVF.isScalable())
if (MaxVF.isScalable()) {
Result.ScalableVF = MaxVF;
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
<< "\n");
}
return MaxFixedVF;
return Result;
}
Optional<ElementCount>
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may by useful to do since it's still likely to be dynamically
@ -5766,7 +5771,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Not inserting runtime ptr check for divergent target",
"runtime pointer checks needed. Not enabled for divergent target",
"CantVersionLoopWithDivergentTarget", ORE, TheLoop);
return None;
return FixedScalableVFPair::getNone();
}
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
@ -5775,7 +5780,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
reportVectorizationFailure("Single iteration (non) loop",
"loop trip count is one, irrelevant for vectorization",
"SingleIterationLoop", ORE, TheLoop);
return None;
return FixedScalableVFPair::getNone();
}
switch (ScalarEpilogueStatus) {
@ -5802,7 +5807,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// Bail if runtime checks are required, which are not good when optimising
// for size.
if (runtimeChecksRequired())
return None;
return FixedScalableVFPair::getNone();
break;
}
@ -5820,7 +5825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
return computeFeasibleMaxVF(TC, UserVF);
}
return None;
return FixedScalableVFPair::getNone();
}
// Now try the tail folding
@ -5835,26 +5840,29 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
assert(!MaxVF.isScalable() &&
"Scalable vectors do not yet support tail folding");
assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
"MaxVF must be a power of 2");
unsigned MaxVFtimesIC =
UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
// Avoid tail folding if the trip count is known to be a multiple of any VF we
// chose.
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
const SCEV *Rem = SE->getURemExpr(
SE->applyLoopGuards(ExitCount, TheLoop),
SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
if (Rem->isZero()) {
// Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we chose.
// FIXME: The condition below pessimises the case for fixed-width vectors,
// when scalable VFs are also candidates for vectorization.
if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
ElementCount MaxFixedVF = MaxFactors.FixedVF;
assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
"MaxFixedVF must be a power of 2");
unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
: MaxFixedVF.getFixedValue();
ScalarEvolution *SE = PSE.getSE();
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
const SCEV *ExitCount = SE->getAddExpr(
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
const SCEV *Rem = SE->getURemExpr(
SE->applyLoopGuards(ExitCount, TheLoop),
SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxFactors;
}
}
// If we don't know the precise trip count, or if the trip count that we
@ -5863,7 +5871,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
if (Legal->prepareToFoldTailByMasking()) {
FoldTailByMasking = true;
return MaxVF;
return MaxFactors;
}
// If there was a tail-folding hint/switch, but we can't fold the tail by
@ -5872,12 +5880,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
"scalar epilogue instead.\n");
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
return MaxVF;
return MaxFactors;
}
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
return None;
return FixedScalableVFPair::getNone();
}
if (TC == 0) {
@ -5885,7 +5893,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Unable to calculate the loop count due to complex control flow",
"unable to calculate the loop count due to complex control flow",
"UnknownLoopCountComplexCFG", ORE, TheLoop);
return None;
return FixedScalableVFPair::getNone();
}
reportVectorizationFailure(
@ -5894,7 +5902,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
"Enable vectorization of this loop with '#pragma clang loop "
"vectorize(enable)' when compiling with -Os/-Oz",
"NoTailLoopWithOptForSize", ORE, TheLoop);
return None;
return FixedScalableVFPair::getNone();
}
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
@ -7928,8 +7936,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
return None;
// Invalidate interleave groups if all blocks of loop will be predicated.
@ -7946,29 +7954,24 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
CM.invalidateCostModelingDecisions();
}
ElementCount MaxVF = MaybeMaxVF.getValue();
assert(MaxVF.isNonZero() && "MaxVF is zero.");
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
if (!UserVF.isZero() &&
(UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
// FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
// VFs here, this should be reverted to only use legal UserVFs once the
// loop below supports scalable VFs.
ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
ElementCount MaxUserVF =
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
if (!UserVF.isZero() && UserVFIsLegal) {
LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
<< " VF " << VF << ".\n");
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
<< " VF " << UserVF << ".\n");
assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
"VF needs to be a power of two");
// Collect the instructions (and their associated costs) that will be more
// profitable to scalarize.
CM.selectUserVectorizationFactor(VF);
CM.selectUserVectorizationFactor(UserVF);
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(VF, VF);
buildVPlansWithVPRecipes({UserVF}, {UserVF});
LLVM_DEBUG(printPlans(dbgs()));
return {{VF, 0}};
return {{UserVF, 0}};
}
ElementCount MaxVF = MaxFactors.FixedVF;
assert(!MaxVF.isScalable() &&
"Scalable vectors not yet supported beyond this point");
@ -7987,7 +7990,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
LLVM_DEBUG(printPlans(dbgs()));
if (MaxVF.isScalar())
if (!MaxFactors.hasVector())
return VectorizationFactor::Disabled();
// Select the optimal vectorization factor.

View File

@ -0,0 +1,33 @@
; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
; This test currently fails when the LV calculates a maximums safe
; distance for scalable vectors, because the code to eliminate the tail is
; pessimistic when scalable vectors are considered. This will be addressed
; in a future patch, at which point we should be able to un-XFAIL the
; test. The expected output is to vectorize this loop without predication
; (and thus have unpredicated vector store).
; XFAIL: *
; CHECK: store <4 x i32>
target triple = "aarch64"
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
define void @f1(i32* %A) #0 {
entry:
br label %for.body
for.body:
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv
store i32 1, i32* %arrayidx, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp ne i64 %iv.next, 1024
br i1 %exitcond, label %for.body, label %exit
exit:
ret void
}
attributes #0 = { "target-features"="+sve" }