[LV] Return both fixed and scalable Max VF from computeMaxVF.

This patch introduces a new class, FixedScalableVFPair, that holds the maximum vectorization factors that have been computed for both scalable and fixed-width vectors. This patch is intended to be NFC for fixed-width vectors, although considering a scalable max VF (which is disabled by default) pessimises tail-loop elimination, since it can no longer determine if any chosen VF (less than fixed/scalable MaxVFs) is guaranteed to handle all vector iterations if the trip-count is known. This issue will be addressed in a future patch. Reviewed By: fhahn, david-arm Differential Revision: https://reviews.llvm.org/D98721
2024-11-22 02:33:06 +01:00 · 2021-05-18 07:37:31 +01:00 · 2021-05-18 07:37:31 +01:00 · 66261e9287
commit 66261e9287
parent 9ff115e8b2
3 changed files with 129 additions and 62 deletions
--- a/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@ -200,6 +200,37 @@ struct VectorizationFactor {
  }
 };

+/// A class that represents two vectorization factors (both initialized to 0
+/// by default): one for fixed-width vectorization and one for scalable
+/// vectorization. This can be used by the vectorizer to choose from a range of
+/// fixed and/or scalable VFs in order to find the most cost-effective VF to
+/// vectorize with.
+struct FixedScalableVFPair {
+  ElementCount FixedVF;
+  ElementCount ScalableVF;
+
+  FixedScalableVFPair()
+      : FixedVF(ElementCount::getFixed(0)),
+        ScalableVF(ElementCount::getScalable(0)) {}
+  FixedScalableVFPair(const ElementCount &Max) : FixedScalableVFPair() {
+    *(Max.isScalable() ? &ScalableVF : &FixedVF) = Max;
+  }
+  FixedScalableVFPair(const ElementCount &FixedVF,
+                      const ElementCount &ScalableVF)
+      : FixedVF(FixedVF), ScalableVF(ScalableVF) {
+    assert(!FixedVF.isScalable() && ScalableVF.isScalable() &&
+           "Invalid scalable properties");
+  }
+
+  static FixedScalableVFPair getNone() { return FixedScalableVFPair(); }
+
+  /// \return true if either fixed- or scalable VF is non-zero.
+  explicit operator bool() const { return FixedVF || ScalableVF; }
+
+  /// \return true if either fixed- or scalable VF is a valid vector VF.
+  bool hasVector() const { return FixedVF.isVector() || ScalableVF.isVector(); }
+};
+
 /// Planner drives the vectorization process after having passed
 /// Legality checks.
 class LoopVectorizationPlanner {
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -1229,9 +1229,10 @@ public:
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

-  /// \return An upper bound for the vectorization factor, or None if
-  /// vectorization and interleaving should be avoided up front.
-  Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
+  /// \return An upper bound for the vectorization factors (both fixed and
+  /// scalable). If the factors are 0, vectorization and interleaving should be
+  /// avoided up front.
+  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
@ -1625,11 +1626,13 @@ public:
 private:
  unsigned NumPredStores = 0;

-  /// \return An upper bound for the vectorization factor, a power-of-2 larger
-  /// than zero. One is returned if vectorization should best be avoided due
-  /// to cost.
-  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
-                                    ElementCount UserVF);
+  /// \return An upper bound for the vectorization factors for both
+  /// fixed and scalable vectorization, where the minimum-known number of
+  /// elements is a power-of-2 larger than zero. If scalable vectorization is
+  /// disabled or unsupported, then the scalable part will be equal to
+  /// ElementCount::getScalable(0).
+  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
+                                           ElementCount UserVF);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
@ -5676,7 +5679,7 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  return MaxScalableVF;
 }

-ElementCount
+FixedScalableVFPair
 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
@ -5742,22 +5745,24 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

-  ElementCount MaxFixedVF = ElementCount::getFixed(1);
+  FixedScalableVFPair Result(ElementCount::getFixed(1),
+                             ElementCount::getScalable(0));
  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeFixedVF))
-    MaxFixedVF = MaxVF;
+    Result.FixedVF = MaxVF;

  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeScalableVF))
-    // FIXME: Return scalable VF as well (to be added in future patch).
-    if (MaxVF.isScalable())
+    if (MaxVF.isScalable()) {
+      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
+    }

-  return MaxFixedVF;
+  return Result;
 }

-Optional<ElementCount>
+FixedScalableVFPair
 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
@ -5766,7 +5771,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
-    return None;
+    return FixedScalableVFPair::getNone();
  }

  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
@ -5775,7 +5780,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
-    return None;
+    return FixedScalableVFPair::getNone();
  }

  switch (ScalarEpilogueStatus) {
@ -5802,7 +5807,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
-      return None;
+      return FixedScalableVFPair::getNone();

    break;
  }
@ -5820,7 +5825,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF);
    }
-    return None;
+    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding
@ -5835,26 +5840,29 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

-  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
-  assert(!MaxVF.isScalable() &&
-         "Scalable vectors do not yet support tail folding");
-  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
-         "MaxVF must be a power of 2");
-  unsigned MaxVFtimesIC =
-      UserIC ? MaxVF.getFixedValue() * UserIC : MaxVF.getFixedValue();
-  // Avoid tail folding if the trip count is known to be a multiple of any VF we
-  // chose.
-  ScalarEvolution *SE = PSE.getSE();
-  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
-  const SCEV *ExitCount = SE->getAddExpr(
-      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
-  const SCEV *Rem = SE->getURemExpr(
-      SE->applyLoopGuards(ExitCount, TheLoop),
-      SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
-  if (Rem->isZero()) {
-    // Accept MaxVF if we do not have a tail.
-    LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
-    return MaxVF;
+  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
+  // Avoid tail folding if the trip count is known to be a multiple of any VF
+  // we chose.
+  // FIXME: The condition below pessimises the case for fixed-width vectors,
+  // when scalable VFs are also candidates for vectorization.
+  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
+    ElementCount MaxFixedVF = MaxFactors.FixedVF;
+    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
+           "MaxFixedVF must be a power of 2");
+    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
+                                   : MaxFixedVF.getFixedValue();
+    ScalarEvolution *SE = PSE.getSE();
+    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+    const SCEV *ExitCount = SE->getAddExpr(
+        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+    const SCEV *Rem = SE->getURemExpr(
+        SE->applyLoopGuards(ExitCount, TheLoop),
+        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+    if (Rem->isZero()) {
+      // Accept MaxFixedVF if we do not have a tail.
+      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+      return MaxFactors;
+    }
  }

  // If we don't know the precise trip count, or if the trip count that we
@ -5863,7 +5871,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
-    return MaxVF;
+    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
@ -5872,12 +5880,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-    return MaxVF;
+    return MaxFactors;
  }

  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
-    return None;
+    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
@ -5885,7 +5893,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
-    return None;
+    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
@ -5894,7 +5902,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
-  return None;
+  return FixedScalableVFPair::getNone();
 }

 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
@ -7928,8 +7936,8 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
 Optional<VectorizationFactor>
 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
-  Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
-  if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
+  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
+  if (!MaxFactors) // Cases that should be neither vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
@ -7946,29 +7954,24 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
      CM.invalidateCostModelingDecisions();
  }

-  ElementCount MaxVF = MaybeMaxVF.getValue();
-  assert(MaxVF.isNonZero() && "MaxVF is zero.");
-
-  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxVF);
-  if (!UserVF.isZero() &&
-      (UserVFIsLegal || (UserVF.isScalable() && MaxVF.isScalable()))) {
-    // FIXME: MaxVF is temporarily used inplace of UserVF for illegal scalable
-    // VFs here, this should be reverted to only use legal UserVFs once the
-    // loop below supports scalable VFs.
-    ElementCount VF = UserVFIsLegal ? UserVF : MaxVF;
+  ElementCount MaxUserVF =
+      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
+  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
+  if (!UserVF.isZero() && UserVFIsLegal) {
    LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
-                      << " VF " << VF << ".\n");
-    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
+                      << " VF " << UserVF << ".\n");
+    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
-    CM.selectUserVectorizationFactor(VF);
+    CM.selectUserVectorizationFactor(UserVF);
    CM.collectInLoopReductions();
-    buildVPlansWithVPRecipes(VF, VF);
+    buildVPlansWithVPRecipes({UserVF}, {UserVF});
    LLVM_DEBUG(printPlans(dbgs()));
-    return {{VF, 0}};
+    return {{UserVF, 0}};
  }

+  ElementCount MaxVF = MaxFactors.FixedVF;
  assert(!MaxVF.isScalable() &&
         "Scalable vectors not yet supported beyond this point");

@ -7987,7 +7990,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {

  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
  LLVM_DEBUG(printPlans(dbgs()));
-  if (MaxVF.isScalar())
+  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
--- a/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
+++ b/test/Transforms/LoopVectorize/AArch64/eliminate-tail-predication.ll
@ -0,0 +1,33 @@
+; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
+
+; This test currently fails when the LV calculates a maximum safe
+; distance for scalable vectors, because the code to eliminate the tail is
+; pessimistic when scalable vectors are considered. This will be addressed
+; in a future patch, at which point we should be able to un-XFAIL the
+; test. The expected output is to vectorize this loop without predication
+; (and thus have an unpredicated vector store).
+; XFAIL: *
+
+; CHECK: store <4 x i32>
+
+target triple = "aarch64"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+
+define void @f1(i32* %A) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %iv
+  store i32 1, i32* %arrayidx, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond = icmp ne i64 %iv.next, 1024
+  br i1 %exitcond, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-features"="+sve" }