mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 02:33:06 +01:00
[LV] Return both fixed and scalable Max VF from computeMaxVF.
This patch introduces a new class, FixedScalableVFPair, that holds the maximum vectorization factors that have been computed for both scalable and fixed-width vectors. This patch is intended to be NFC for fixed-width vectors, although considering a scalable max VF (which is disabled by default) pessimises tail-loop elimination, since it can no longer determine if any chosen VF (less than fixed/scalable MaxVFs) is guaranteed to handle all vector iterations if the trip-count is known. This issue will be addressed in a future patch. Reviewed By: fhahn, david-arm Differential Revision: https://reviews.llvm.org/D98721
This commit is contained in:
parent
9ff115e8b2
commit
66261e9287
@ -200,6 +200,37 @@ struct VectorizationFactor {
|
||||
}
|
||||
};
|
||||
|
||||
/// A class that represents two vectorization factors (initialized with 0 by
|
||||
/// default). One for fixed-width vectorization and one for scalable
|
||||
/// vectorization. This can be used by the vectorizer to choose from a range of
|
||||
/// fixed and/or scalable VFs in order to find the most cost-effective VF to
|
||||
/// vectorize with.
|
||||
struct FixedScalableVFPair {
|
||||
ElementCount FixedVF;
|
||||
ElementCount ScalableVF;
|
||||
|
||||
FixedScalableVFPair()
|
||||
: FixedVF(ElementCount::getFixed(0)),
|
||||
ScalableVF(ElementCount::getScalable(0)) {}
|
||||
FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
|
||||
*(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
|
||||
}
|
||||
FixedScalableVFPair(const ElementCount &FixedVF,
|
||||
const ElementCount &ScalableVF)
|
||||
: FixedVF(FixedVF), ScalableVF(ScalableVF) {
|
||||
assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
|
||||
"Invalid scalable properties");
|
||||
}
|
||||
|
||||
static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
|
||||
|
||||
/// \return true if either fixed- or scalable VF is non-zero.
|
||||
explicit operator bool() const { return FixedVF || ScalableVF; }
|
||||
|
||||
/// \return true if either fixed- or scalable VF is a valid vector VF.
|
||||
bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
|
||||
};
|
||||
|
||||
/// Planner drives the vectorization process after having passed
|
||||
/// Legality checks.
|
||||
class LoopVectorizationPlanner {
|
||||
|
@ -1229,9 +1229,10 @@ public:
|
||||
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
|
||||
Hints(Hints), InterleaveInfo(IAI) {}
|
||||
|
||||
/// \return An upper bound for the vectorization factor, or None if
|
||||
/// vectorization and interleaving should be avoided up front.
|
||||
Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
|
||||
/// \return An upper bound for the vectorization factors (both fixed and
|
||||
/// scalable). If the factors are 0, vectorization and interleaving should be
|
||||
/// avoided up front.
|
||||
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
|
||||
|
||||
/// \return True if runtime checks are required for vectorization, and false
|
||||
/// otherwise.
|
||||
@ -1625,11 +1626,13 @@ public:
|
||||
private:
|
||||
unsigned NumPredStores = 0;
|
||||
|
||||
/// \return An upper bound for the vectorization factor, a power-of-2 larger
|
||||
/// than zero. One is returned if vectorization should best be avoided due
|
||||
/// to cost.
|
||||
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
|
||||
ElementCount UserVF);
|
||||
/// \return An upper bound for the vectorization factors for both
|
||||
/// fixed and scalable vectorization, where the minimum-known number of
|
||||
/// elements is a power-of-2 larger than zero. If scalable vectorization is
|
||||
/// disabled or unsupported, then the scalable part will be equal to
|
||||
/// ElementCount::getScalable(0).
|
||||
FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
|
||||
ElementCount UserVF);
|
||||
|
||||
/// \return the maximized element count based on the targets vector
|
||||
/// registers and the loop trip-count, but limited to a maximum safe VF.
|
||||
@ -5676,7 +5679,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
|
||||
return MaxScalableVF;
|
||||
}
|
||||
|
||||
ElementCount
|
||||
FixedScalableVFPair
|
||||
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
|
||||
ElementCount UserVF) {
|
||||
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
|
||||
@ -5742,22 +5745,24 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
|
||||
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
|
||||
<< " / " << WidestType << " bits.\n");
|
||||
|
||||
ElementCount MaxFixedVF = ElementCount::getFixed(1);
|
||||
FixedScalableVFPair Result(ElementCount::getFixed(1),
|
||||
ElementCount::getScalable(0));
|
||||
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
|
||||
WidestType, MaxSafeFixedVF))
|
||||
MaxFixedVF = MaxVF;
|
||||
Result.FixedVF = MaxVF;
|
||||
|
||||
if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
|
||||
WidestType, MaxSafeScalableVF))
|
||||
// FIXME: Return scalable VF as well (to be added in future patch).
|
||||
if (MaxVF.isScalable())
|
||||
if (MaxVF.isScalable()) {
|
||||
Result.ScalableVF = MaxVF;
|
||||
LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
|
||||
<< "\n");
|
||||
}
|
||||
|
||||
return MaxFixedVF;
|
||||
return Result;
|
||||
}
|
||||
|
||||
Optional<ElementCount>
|
||||
FixedScalableVFPair
|
||||
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
|
||||
// TODO: It may be useful to do since it's still likely to be dynamically
|
||||
@ -5766,7 +5771,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
"Not inserting runtime ptr check for divergent target",
|
||||
"runtime pointer checks needed. Not enabled for divergent target",
|
||||
"CantVersionLoopWithDivergentTarget", ORE, TheLoop);
|
||||
return None;
|
||||
return FixedScalableVFPair::getNone();
|
||||
}
|
||||
|
||||
unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
|
||||
@ -5775,7 +5780,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
reportVectorizationFailure("Single iteration (non) loop",
|
||||
"loop trip count is one, irrelevant for vectorization",
|
||||
"SingleIterationLoop", ORE, TheLoop);
|
||||
return None;
|
||||
return FixedScalableVFPair::getNone();
|
||||
}
|
||||
|
||||
switch (ScalarEpilogueStatus) {
|
||||
@ -5802,7 +5807,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
// Bail if runtime checks are required, which are not good when optimising
|
||||
// for size.
|
||||
if (runtimeChecksRequired())
|
||||
return None;
|
||||
return FixedScalableVFPair::getNone();
|
||||
|
||||
break;
|
||||
}
|
||||
@ -5820,7 +5825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
|
||||
return computeFeasibleMaxVF(TC, UserVF);
|
||||
}
|
||||
return None;
|
||||
return FixedScalableVFPair::getNone();
|
||||
}
|
||||
|
||||
// Now try the tail folding
|
||||
@ -5835,26 +5840,29 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
|
||||
}
|
||||
|
||||
ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
|
||||
assert(!MaxVF.isScalable() &&
|
||||
"Scalable vectors do not yet support tail folding");
|
||||
assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
|
||||
"MaxVF must be a power of 2");
|
||||
unsigned MaxVFtimesIC =
|
||||
UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
|
||||
// Avoid tail folding if the trip count is known to be a multiple of any VF we
|
||||
// chose.
|
||||
ScalarEvolution *SE = PSE.getSE();
|
||||
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
|
||||
const SCEV *ExitCount = SE->getAddExpr(
|
||||
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
|
||||
const SCEV *Rem = SE->getURemExpr(
|
||||
SE->applyLoopGuards(ExitCount, TheLoop),
|
||||
SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
|
||||
if (Rem->isZero()) {
|
||||
// Accept MaxVF if we do not have a tail.
|
||||
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
|
||||
return MaxVF;
|
||||
FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
|
||||
// Avoid tail folding if the trip count is known to be a multiple of any VF
|
||||
// we chose.
|
||||
// FIXME: The condition below pessimises the case for fixed-width vectors,
|
||||
// when scalable VFs are also candidates for vectorization.
|
||||
if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
|
||||
ElementCount MaxFixedVF = MaxFactors.FixedVF;
|
||||
assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
|
||||
"MaxFixedVF must be a power of 2");
|
||||
unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
|
||||
: MaxFixedVF.getFixedValue();
|
||||
ScalarEvolution *SE = PSE.getSE();
|
||||
const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
|
||||
const SCEV *ExitCount = SE->getAddExpr(
|
||||
BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
|
||||
const SCEV *Rem = SE->getURemExpr(
|
||||
SE->applyLoopGuards(ExitCount, TheLoop),
|
||||
SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
|
||||
if (Rem->isZero()) {
|
||||
// Accept MaxFixedVF if we do not have a tail.
|
||||
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
|
||||
return MaxFactors;
|
||||
}
|
||||
}
|
||||
|
||||
// If we don't know the precise trip count, or if the trip count that we
|
||||
@ -5863,7 +5871,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
|
||||
if (Legal->prepareToFoldTailByMasking()) {
|
||||
FoldTailByMasking = true;
|
||||
return MaxVF;
|
||||
return MaxFactors;
|
||||
}
|
||||
|
||||
// If there was a tail-folding hint/switch, but we can't fold the tail by
|
||||
@ -5872,12 +5880,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
|
||||
"scalar epilogue instead.\n");
|
||||
ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
|
||||
return MaxVF;
|
||||
return MaxFactors;
|
||||
}
|
||||
|
||||
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
|
||||
LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
|
||||
return None;
|
||||
return FixedScalableVFPair::getNone();
|
||||
}
|
||||
|
||||
if (TC == 0) {
|
||||
@ -5885,7 +5893,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
"Unable to calculate the loop count due to complex control flow",
|
||||
"unable to calculate the loop count due to complex control flow",
|
||||
"UnknownLoopCountComplexCFG", ORE, TheLoop);
|
||||
return None;
|
||||
return FixedScalableVFPair::getNone();
|
||||
}
|
||||
|
||||
reportVectorizationFailure(
|
||||
@ -5894,7 +5902,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
|
||||
"Enable vectorization of this loop with '#pragma clang loop "
|
||||
"vectorize(enable)' when compiling with -Os/-Oz",
|
||||
"NoTailLoopWithOptForSize", ORE, TheLoop);
|
||||
return None;
|
||||
return FixedScalableVFPair::getNone();
|
||||
}
|
||||
|
||||
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
|
||||
@ -7928,8 +7936,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
|
||||
Optional<VectorizationFactor>
|
||||
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
|
||||
assert(OrigLoop->isInnermost() && "Inner loop expected.");
|
||||
Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
|
||||
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
|
||||
FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
|
||||
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
|
||||
return None;
|
||||
|
||||
// Invalidate interleave groups if all blocks of loop will be predicated.
|
||||
@ -7946,29 +7954,24 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
|
||||
CM.invalidateCostModelingDecisions();
|
||||
}
|
||||
|
||||
ElementCount MaxVF = MaybeMaxVF.getValue();
|
||||
assert(MaxVF.isNonZero() && "MaxVF is zero.");
|
||||
|
||||
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
|
||||
if (!UserVF.isZero() &&
|
||||
(UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
|
||||
// FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
|
||||
// VFs here, this should be reverted to only use legal UserVFs once the
|
||||
// loop below supports scalable VFs.
|
||||
ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
|
||||
ElementCount MaxUserVF =
|
||||
UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
|
||||
bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
|
||||
if (!UserVF.isZero() && UserVFIsLegal) {
|
||||
LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
|
||||
<< " VF " << VF << ".\n");
|
||||
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
|
||||
<< " VF " << UserVF << ".\n");
|
||||
assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
|
||||
"VF needs to be a power of two");
|
||||
// Collect the instructions (and their associated costs) that will be more
|
||||
// profitable to scalarize.
|
||||
CM.selectUserVectorizationFactor(VF);
|
||||
CM.selectUserVectorizationFactor(UserVF);
|
||||
CM.collectInLoopReductions();
|
||||
buildVPlansWithVPRecipes(VF, VF);
|
||||
buildVPlansWithVPRecipes({UserVF}, {UserVF});
|
||||
LLVM_DEBUG(printPlans(dbgs()));
|
||||
return {{VF, 0}};
|
||||
return {{UserVF, 0}};
|
||||
}
|
||||
|
||||
ElementCount MaxVF = MaxFactors.FixedVF;
|
||||
assert(!MaxVF.isScalable() &&
|
||||
"Scalable vectors not yet supported beyond this point");
|
||||
|
||||
@ -7987,7 +7990,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
|
||||
|
||||
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
|
||||
LLVM_DEBUG(printPlans(dbgs()));
|
||||
if (MaxVF.isScalar())
|
||||
if (!MaxFactors.hasVector())
|
||||
return VectorizationFactor::Disabled();
|
||||
|
||||
// Select the optimal vectorization factor.
|
||||
|
@ -0,0 +1,33 @@
|
||||
; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
|
||||
|
||||
; This test currently fails when the LV calculates a maximum safe
|
||||
; distance for scalable vectors, because the code to eliminate the tail is
|
||||
; pessimistic when scalable vectors are considered. This will be addressed
|
||||
; in a future patch, at which point we should be able to un-XFAIL the
|
||||
; test. The expected output is to vectorize this loop without predication
|
||||
; (and thus have unpredicated vector store).
|
||||
; XFAIL: *
|
||||
|
||||
; CHECK: store <4 x i32>
|
||||
|
||||
target triple = "aarch64"
|
||||
target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
|
||||
|
||||
|
||||
define void @f1(i32* %A) #0 {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv
|
||||
store i32 1, i32* %arrayidx, align 4
|
||||
%iv.next = add nuw nsw i64 %iv, 1
|
||||
%exitcond = icmp ne i64 %iv.next, 1024
|
||||
br i1 %exitcond, label %for.body, label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "target-features"="+sve" }
|
Loading…
Reference in New Issue
Block a user