[PGO][PGSO] Add profile guided size optimization to loop vectorization legality.

2025-01-31 20:51:52 +01:00 · 2020-07-20 15:11:38 -07:00 · 2020-07-20 15:11:38 -07:00 · d9addf9fa8
commit d9addf9fa8
parent d00a5b3d7b
4 changed files with 106 additions and 15 deletions
--- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@ -202,9 +202,10 @@ public:
      Function *F, std::function<const LoopAccessInfo &(Loop &)> *GetLAA,
      LoopInfo *LI, OptimizationRemarkEmitter *ORE,
      LoopVectorizationRequirements *R, LoopVectorizeHints *H, DemandedBits *DB,
-      AssumptionCache *AC)
+      AssumptionCache *AC, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI)
      : TheLoop(L), LI(LI), PSE(PSE), TTI(TTI), TLI(TLI), DT(DT),
-        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC) {}
+        GetLAA(GetLAA), ORE(ORE), Requirements(R), Hints(H), DB(DB), AC(AC),
+        BFI(BFI), PSI(PSI) {}

  /// ReductionList contains the reduction descriptors for all
  /// of the reductions that were found in the loop.
@ -478,6 +479,10 @@ private:
  /// Assume instructions in predicated blocks must be dropped if the CFG gets
  /// flattened.
  SmallPtrSet<Instruction *, 8> ConditionalAssumes;
+
+  /// BFI and PSI are used to check for profile guided size optimizations.
+  BlockFrequencyInfo *BFI;
+  ProfileSummaryInfo *PSI;
 };

 } // namespace llvm
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@ -20,6 +20,7 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"

 using namespace llvm;
@ -412,7 +413,11 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
  const ValueToValueMap &Strides =
      getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();

-  bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize();
+  Function *F = TheLoop->getHeader()->getParent();
+  bool OptForSize = F->hasOptSize() ||
+                    llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
+                                                PGSOQueryType::IRPass);
+  bool CanAddPredicate = !OptForSize;
  int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
  if (Stride == 1 || Stride == -1)
    return Stride;
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -395,11 +395,19 @@ public:
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, unsigned VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
-                      LoopVectorizationCostModel *CM)
+                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+                      ProfileSummaryInfo *PSI)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()),
-        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
+        VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
+        BFI(BFI), PSI(PSI) {
+    // Query this against the original loop and save it here because the profile
+    // of the original loop header may change as the transformation happens.
+    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
+        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
+  }
+
  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop. Unlink the old loop and connect the new one.
@ -779,6 +787,14 @@ protected:
  // Vector of original scalar PHIs whose corresponding widened PHIs need to be
  // fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;
+
+  /// BFI and PSI are used to check for profile guided size optimizations.
+  BlockFrequencyInfo *BFI;
+  ProfileSummaryInfo *PSI;
+
+  // Whether this loop should be optimized for size based on profile guided size
+  // optimizatios.
+  bool OptForSizeBasedOnProfile;
 };

 class InnerLoopUnroller : public InnerLoopVectorizer {
@ -789,9 +805,10 @@ public:
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
-                    LoopVectorizationCostModel *CM)
+                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
+                    ProfileSummaryInfo *PSI)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
-                            UnrollFactor, LVL, CM) {}
+                            UnrollFactor, LVL, CM, BFI, PSI) {}

 private:
  Value *getBroadcastInstrs(Value *V) override;
@ -2754,7 +2771,8 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
    if (C->isZero())
      return;

-  assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
+  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
+           OptForSizeBasedOnProfile) &&
         "Cannot SCEV check stride or overflow when optimizing for size");

  SCEVCheckBlock->setName("vector.scevcheck");
@ -2800,7 +2818,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
  assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
                            "claimed checks are required");

-  if (MemCheckBlock->getParent()->hasOptSize()) {
+  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
@ -7729,7 +7747,7 @@ static bool processLoopInVPlanNativePath(
  LVP.setBestPlan(VF.Width, 1);

  InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
-                         &CM);
+                         &CM, BFI, PSI);
  LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                    << L->getHeader()->getParent()->getName() << "\"\n");
  LVP.executePlan(LB, DT);
@ -7793,7 +7811,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements(*ORE);
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
-                                &Requirements, &Hints, DB, AC);
+                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
@ -7993,8 +8011,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
    assert(IC > 1 && "interleave count should not be 1 or 0");
    // If we decided that it is not legal to vectorize the loop, then
    // interleave it.
-    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
-                               &CM);
+    InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
+                               BFI, PSI);
    LVP.executePlan(Unroller, DT);

    ORE->emit([&]() {
@ -8006,7 +8024,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  } else {
    // If we decided that it is *legal* to vectorize the loop, then do it.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
-                           &LVL, &CM);
+                           &LVL, &CM, BFI, PSI);
    LVP.executePlan(LB, DT);
    ++LoopsVectorized;

--- a/test/Transforms/LoopVectorize/optsize.ll
+++ b/test/Transforms/LoopVectorize/optsize.ll
@ -121,6 +121,38 @@ for.body29:
  br i1 %cmp26, label %for.body29, label %for.cond.cleanup28
 }

+define void @pr43371_pgso() !prof !14 {
+;
+; CHECK-LABEL: @pr43371_pgso
+; CHECK-NOT:   vector.scevcheck
+;
+; We do not want to generate SCEV predicates when optimising for size, because
+; that will lead to extra code generation such as the SCEV overflow runtime
+; checks. Not generating SCEV predicates can still result in vectorisation as
+; the non-consecutive loads/stores can be scalarized:
+;
+; CHECK: vector.body:
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: store i16 0, i16* %{{.*}}, align 1
+; CHECK: br i1 {{.*}}, label %vector.body
+;
+entry:
+  br label %for.body29
+
+for.cond.cleanup28:
+  unreachable
+
+for.body29:
+  %i24.0170 = phi i16 [ 0, %entry], [ %inc37, %for.body29]
+  %add33 = add i16 undef, %i24.0170
+  %idxprom34 = zext i16 %add33 to i32
+  %arrayidx35 = getelementptr [2592 x i16], [2592 x i16] * @cm_array, i32 0, i32 %idxprom34
+  store i16 0, i16 * %arrayidx35, align 1
+  %inc37 = add i16 %i24.0170, 1
+  %cmp26 = icmp ult i16 %inc37, 756
+  br i1 %cmp26, label %for.body29, label %for.cond.cleanup28
+}
+
 ; PR45526: don't vectorize with fold-tail if first-order-recurrence is live-out.
 ;
 define i32 @pr45526() optsize {
@ -154,6 +186,37 @@ exit:
  ret i32 %for
 }

+define i32 @pr45526_pgso() !prof !14 {
+;
+; CHECK-LABEL: @pr45526_pgso
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   br label %loop
+; CHECK-EMPTY:
+; CHECK-NEXT: loop:
+; CHECK-NEXT:   %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT:   %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ]
+; CHECK-NEXT:   %pivPlus1 = add nuw nsw i32 %piv, 1
+; CHECK-NEXT:   %cond = icmp ult i32 %piv, 510
+; CHECK-NEXT:   br i1 %cond, label %loop, label %exit
+; CHECK-EMPTY:
+; CHECK-NEXT: exit:
+; CHECK-NEXT:   %for.lcssa = phi i32 [ %for, %loop ]
+; CHECK-NEXT:   ret i32 %for.lcssa
+;
+entry:
+  br label %loop
+
+loop:
+  %piv = phi i32 [ 0, %entry ], [ %pivPlus1, %loop ]
+  %for = phi i32 [ 5, %entry ], [ %pivPlus1, %loop ]
+  %pivPlus1 = add nuw nsw i32 %piv, 1
+  %cond = icmp ult i32 %piv, 510
+  br i1 %cond, label %loop, label %exit
+
+exit:
+  ret i32 %for
+}
+
 ; PR46228: Vectorize w/o versioning for unit stride under optsize and enabled
 ; vectorization.

@ -190,7 +253,7 @@ define void @stride1(i16* noalias %B, i32 %BStride) optsize {
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !21
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph: