
Add Loop Sink pass to reverse LICM based on basic block frequency.

Summary: LICM may hoist instructions into the preheader speculatively. Before code generation, we need to sink the hoisted instructions back into the loop if that is beneficial. This pass is the reverse of LICM: it looks at instructions in the preheader and sinks them into basic blocks inside the loop body whose frequency is smaller than the preheader's frequency.

Reviewers: hfinkel, davidxl, chandlerc

Subscribers: anna, modocache, mgorny, beanz, reames, dberlin, chandlerc, mcrosier, junbuml, sanjoy, mzolotukhin, llvm-commits

Differential Revision: https://reviews.llvm.org/D22778

llvm-svn: 285308
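
To make the transformation concrete, here is a small source-level sketch (an assumed illustration, mirroring the C snippet reproduced in the sink test below): LICM hoists the load of the global g into the loop preheader even though it is only needed on a rarely taken path; when profile data shows that path is much colder than the preheader, this pass sinks the load back into the cold block.

int g;
int foo(int p, int x) {
  for (int i = 0; i != x; i++)
    // Cold path: LICM hoists the load of g to the preheader; the Loop Sink
    // pass moves it back here because this block runs far less often than
    // the preheader does.
    if (__builtin_expect(i == p, 0)) {
      x += g;
      x *= g;
    }
  return x;
}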
Dehao Chen 2016-10-27 16:30:08 +00:00
parent 119c41482e
commit ecb41605f5
10 changed files with 703 additions and 14 deletions


@@ -168,6 +168,7 @@ void initializeIntervalPartitionPass(PassRegistry&);
void initializeJumpThreadingPass(PassRegistry&);
void initializeLCSSAWrapperPassPass(PassRegistry &);
void initializeLegacyLICMPassPass(PassRegistry&);
void initializeLegacyLoopSinkPassPass(PassRegistry&);
void initializeLazyBranchProbabilityInfoPassPass(PassRegistry&);
void initializeLazyBlockFrequencyInfoPassPass(PassRegistry&);
void initializeLazyValueInfoWrapperPassPass(PassRegistry&);


@@ -112,6 +112,7 @@ namespace {
(void) llvm::createInternalizePass();
(void) llvm::createLCSSAPass();
(void) llvm::createLICMPass();
(void) llvm::createLoopSinkPass();
(void) llvm::createLazyValueInfoPass();
(void) llvm::createLoopExtractorPass();
(void) llvm::createLoopInterchangePass();


@@ -138,6 +138,13 @@ FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines = true);
//
Pass *createLICMPass();
//===----------------------------------------------------------------------===//
//
// LoopSink - This pass sinks invariants from the preheader into the loop body
// where the block frequency is lower than the loop preheader's frequency.
//
Pass *createLoopSinkPass();
//===----------------------------------------------------------------------===//
//
// LoopInterchange - This pass interchanges loops to provide a more


@@ -467,6 +467,17 @@ void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
/// All loop passes should call this as part of implementing their \c
/// getAnalysisUsage.
void getLoopAnalysisUsage(AnalysisUsage &AU);
/// Returns true if the hoister and sinker can handle this instruction.
/// If SafetyInfo is null, we are checking for sinking instructions from
/// preheader to loop body (no speculation).
/// If SafetyInfo is not null, we are checking for hoisting/sinking
/// instructions from loop body to preheader/exit. Check if the instruction
/// can execute speculatively.
///
bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo);
}
#endif


@@ -17,6 +17,7 @@ add_llvm_library(LLVMScalarOpts
IndVarSimplify.cpp
JumpThreading.cpp
LICM.cpp
LoopSink.cpp
LoadCombine.cpp
LoopDeletion.cpp
LoopDataPrefetch.cpp


@@ -100,10 +100,6 @@ static Instruction *
CloneInstructionInExitBlock(Instruction &I, BasicBlock &ExitBlock, PHINode &PN,
const LoopInfo *LI,
const LoopSafetyInfo *SafetyInfo);
static bool canSinkOrHoistInst(Instruction &I, AliasAnalysis *AA,
DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo);
namespace {
struct LoopInvariantCodeMotion {
@@ -436,16 +432,9 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
SafetyInfo->BlockColors = colorEHFunclets(*Fn);
}
/// Returns true if the hoister and sinker can handle this instruction.
/// If SafetyInfo is nullptr, we are checking for sinking instructions from
/// preheader to loop body (no speculation).
/// If SafetyInfo is not nullptr, we are checking for hoisting/sinking
/// instructions from loop body to preheader/exit. Check if the instruction
/// can execute speculatively.
///
bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo) {
bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
Loop *CurLoop, AliasSetTracker *CurAST,
LoopSafetyInfo *SafetyInfo) {
// Loads have extra constraints we have to verify before we can hoist them.
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isUnordered())


@@ -0,0 +1,328 @@
//===-- LoopSink.cpp - Loop Sink Pass ------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass does the inverse transformation of what LICM does.
// It traverses all of the instructions in the loop's preheader and sinks
// them into the loop body wherever the block frequency is lower than the
// loop preheader's frequency.
// This pass is a reverse-transformation of LICM. It differs from the Sink
// pass in the following ways:
//
// * It only handles sinking of instructions from the loop's preheader to the
// loop's body
// * It uses alias set tracker to get more accurate alias info
// * It uses block frequency info to find the optimal sinking locations
//
// Overall algorithm:
//
// For I in Preheader:
//   InsertBBs = BBs that use I
//   For BB in sorted(LoopBBs):
//     DomBBs = BBs in InsertBBs that are dominated by BB
//     if freq(DomBBs) > freq(BB)
//       InsertBBs = InsertBBs - DomBBs + BB
//   For BB in InsertBBs:
//     Insert I at BB's beginning
//===----------------------------------------------------------------------===//
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/LoopPassManager.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
using namespace llvm;
#define DEBUG_TYPE "loopsink"
STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
static cl::opt<unsigned> SinkFrequencyPercentThreshold(
"sink-freq-percent-threshold", cl::Hidden, cl::init(90),
cl::desc("Do not sink instructions that require cloning unless they "
"execute less than this percent of the time."));
static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
"max-uses-for-sinking", cl::Hidden, cl::init(30),
cl::desc("Do not sink instructions that have too many uses."));
/// Return adjusted total frequency of \p BBs.
///
/// * If there is only one BB, sinking the instruction will not introduce any
/// code size increase. Thus there is no need to adjust the frequency.
/// * If there is more than one BB, sinking would lead to code size increase.
/// In this case, we add some "tax" to the total frequency to make it harder
/// to sink. E.g.
/// Freq(Preheader) = 100
/// Freq(BBs) = sum(50, 49) = 99
/// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheader to
/// BBs as the difference is too small to justify the code size increase.
/// To model this, the adjusted Freq(BBs) will be:
/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
BlockFrequencyInfo &BFI) {
BlockFrequency T = 0;
for (BasicBlock *B : BBs)
T += BFI.getBlockFreq(B);
if (BBs.size() > 1)
T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
return T;
}
/// Return a set of basic blocks into which the sunk instructions will be
/// inserted.
///
/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
///
/// * Inside the loop \p L
/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
/// that dominates the UseBB
/// * Has minimum total frequency that is no greater than preheader frequency
///
/// The purpose of the function is to find the optimal sinking points to
/// minimize execution cost, which is defined as "sum of frequency of
/// BBsToSinkInto".
/// As a result, the returned BBsToSinkInto needs to have minimum total
/// frequency.
/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
/// frequency, the optimal solution is not sinking (return empty set).
///
/// \p ColdLoopBBs is used to help find the optimal sinking locations.
/// It stores a list of BBs that is:
///
/// * Inside the loop \p L
/// * Has a frequency no larger than the loop's preheader
/// * Sorted by BB frequency
///
/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
/// caller.
static SmallPtrSet<BasicBlock *, 2>
findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
DominatorTree &DT, BlockFrequencyInfo &BFI) {
SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
if (UseBBs.size() == 0)
return BBsToSinkInto;
BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
// For every iteration:
// * Pick the ColdestBB from ColdLoopBBs
// * Find the set BBsDominatedByColdestBB that satisfy:
// - BBsDominatedByColdestBB is a subset of BBsToSinkInto
// - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
// * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
// BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
// BBsToSinkInto
for (BasicBlock *ColdestBB : ColdLoopBBs) {
BBsDominatedByColdestBB.clear();
for (BasicBlock *SinkedBB : BBsToSinkInto)
if (DT.dominates(ColdestBB, SinkedBB))
BBsDominatedByColdestBB.insert(SinkedBB);
if (BBsDominatedByColdestBB.size() == 0)
continue;
if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
BFI.getBlockFreq(ColdestBB)) {
for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
BBsToSinkInto.erase(DominatedBB);
}
BBsToSinkInto.insert(ColdestBB);
}
}
// If the total frequency of BBsToSinkInto is larger than preheader frequency,
// do not sink.
if (adjustedSumFreq(BBsToSinkInto, BFI) >
BFI.getBlockFreq(L.getLoopPreheader()))
BBsToSinkInto.clear();
return BBsToSinkInto;
}
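// Editor's illustration (not part of the original patch): using the
// approximate frequencies from test t1 below -- Freq(preheader) = 1000,
// Freq(b2) = 15, Freq(b3) = Freq(b4) = 7 -- and UseBBs = {b3, b4}:
//   * ColdestBB = b3: it dominates only {b3}; adjustedSumFreq({b3}) = 7 is
//     not greater than Freq(b3) = 7, so BBsToSinkInto stays {b3, b4}.
//   * ColdestBB = b4: symmetric, no change.
//   * ColdestBB = b2: it dominates {b3, b4}; adjustedSumFreq({b3, b4}) =
//     (7 + 7) / 90% ~= 15.6 > Freq(b2) = 15, so {b3, b4} is replaced by {b2}.
//   * Final check: adjustedSumFreq({b2}) = 15 <= Freq(preheader) = 1000, so
//     the invariant load is sunk into b2, matching the CHECK lines of t1.
// Cold blocks that dominate none of the current candidates (such as b5) are
// skipped by the loop above.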
// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
// sinking is successful.
// \p LoopBlockNumber is used to sort the insertion blocks to ensure
// determinism.
static bool sinkInstruction(Loop &L, Instruction &I,
const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber,
LoopInfo &LI, DominatorTree &DT,
BlockFrequencyInfo &BFI) {
// Compute the set of blocks in loop L which contain a use of I.
SmallPtrSet<BasicBlock *, 2> BBs;
for (auto &U : I.uses()) {
Instruction *UI = cast<Instruction>(U.getUser());
// We cannot sink I to PHI-uses.
if (dyn_cast<PHINode>(UI))
return false;
// We cannot sink I if it has uses outside of the loop.
if (!L.contains(LI.getLoopFor(UI->getParent())))
return false;
BBs.insert(UI->getParent());
}
// findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
// BBs.size() to avoid expensive computation.
// FIXME: Handle code size growth for min_size and opt_size.
if (BBs.size() > MaxNumberOfUseBBsForSinking)
return false;
// Find the set of BBs into which we should insert a copy of I.
SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
if (BBsToSinkInto.empty())
return false;
// Copy the final BBs into a vector and sort them using the total ordering
// of the loop block numbers as iterating the set doesn't give a useful
// order. No need to stable sort as the block numbers are a total ordering.
SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
SortedBBsToSinkInto.insert(SortedBBsToSinkInto.begin(), BBsToSinkInto.begin(),
BBsToSinkInto.end());
std::sort(SortedBBsToSinkInto.begin(), SortedBBsToSinkInto.end(),
[&](BasicBlock *A, BasicBlock *B) {
return *LoopBlockNumber.find(A) < *LoopBlockNumber.find(B);
});
BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
// FIXME: Optimize the efficiency for cloned value replacement. The current
// implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
for (BasicBlock *N : SortedBBsToSinkInto) {
if (N == MoveBB)
continue;
// Clone I and replace its uses.
Instruction *IC = I.clone();
IC->setName(I.getName());
IC->insertBefore(&*N->getFirstInsertionPt());
// Replaces uses of I with IC in N
for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;) {
Use &U = *UI++;
auto *I = cast<Instruction>(U.getUser());
if (I->getParent() == N)
U.set(IC);
}
// Replaces uses of I with IC in blocks dominated by N
replaceDominatedUsesWith(&I, IC, DT, N);
DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
<< '\n');
NumLoopSunkCloned++;
}
DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
NumLoopSunk++;
I.moveBefore(&*MoveBB->getFirstInsertionPt());
return true;
}
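// Editor's illustration (not part of the original patch): in test t2 below,
// the invariant load is used in b3 and b6 (approximately Freq(preheader) =
// 500, Freq(b3) = Freq(b6) = 8). Neither block dominates the other, so
// findBBsToSinkInto returns {b3, b6}: adjustedSumFreq({b3, b6}) = 16 / 90%
// ~= 17.8, still well below the preheader frequency. sinkInstruction then
// moves the load into one of the two blocks and inserts a clone into the
// other, rewriting the uses each copy dominates (NumLoopSunkCloned counts the
// clone), which is what t2's CHECK lines expect in both .b3 and .b6.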
/// Sinks instructions from the loop's preheader into the loop body if the
/// total frequency of the inserted copies is smaller than the preheader's
/// frequency.
static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
DominatorTree &DT,
BlockFrequencyInfo &BFI,
ScalarEvolution *SE) {
BasicBlock *Preheader = L.getLoopPreheader();
if (!Preheader)
return false;
const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
// If there are no basic blocks with lower frequency than the preheader then
// we can avoid the detailed analysis as we will never find profitable sinking
// opportunities.
if (all_of(L.blocks(), [&](const BasicBlock *BB) {
return BFI.getBlockFreq(BB) > PreheaderFreq;
}))
return false;
bool Changed = false;
AliasSetTracker CurAST(AA);
// Compute alias set.
for (BasicBlock *BB : L.blocks())
CurAST.add(*BB);
// Sort loop's basic blocks by frequency
SmallVector<BasicBlock *, 10> ColdLoopBBs;
SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
int i = 0;
for (BasicBlock *B : L.blocks())
if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
ColdLoopBBs.push_back(B);
LoopBlockNumber[B] = ++i;
}
std::stable_sort(ColdLoopBBs.begin(), ColdLoopBBs.end(),
[&](BasicBlock *A, BasicBlock *B) {
return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
});
// Traverse the preheader's instructions in reverse order because if A depends
// on B (A appears after B), A needs to be sunk before B can be sunk.
for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
Instruction *I = &*II++;
if (!L.hasLoopInvariantOperands(I) ||
!canSinkOrHoistInst(*I, &AA, &DT, &L, &CurAST, nullptr))
continue;
if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI))
Changed = true;
}
if (Changed && SE)
SE->forgetLoopDispositions(&L);
return Changed;
}
namespace {
struct LegacyLoopSinkPass : public LoopPass {
static char ID;
LegacyLoopSinkPass() : LoopPass(ID) {
initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (skipLoop(L))
return false;
auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
return sinkLoopInvariantInstructions(
*L, getAnalysis<AAResultsWrapperPass>().getAAResults(),
getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
SE ? &SE->getSE() : nullptr);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<BlockFrequencyInfoWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
}
char LegacyLoopSinkPass::ID = 0;
INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
false)
INITIALIZE_PASS_DEPENDENCY(LoopPass)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
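
For readers who want to exercise the new pass outside of opt, here is a minimal sketch of scheduling it with the legacy pass manager. The helper name runLoopSink and the surrounding driver are assumptions for illustration; createLoopSinkPass itself is the entry point this patch adds to llvm/Transforms/Scalar.h, and the tests below invoke the same pass as opt -S -loop-sink.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

// Hypothetical driver: run the Loop Sink pass over a module. The legacy pass
// manager schedules the required analyses (BlockFrequencyInfo, DominatorTree,
// LoopInfo, AAResults) automatically.
void runLoopSink(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createLoopSinkPass());
  PM.run(M);
}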


@@ -51,6 +51,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeIndVarSimplifyLegacyPassPass(Registry);
initializeJumpThreadingPass(Registry);
initializeLegacyLICMPassPass(Registry);
initializeLegacyLoopSinkPassPass(Registry);
initializeLoopDataPrefetchLegacyPassPass(Registry);
initializeLoopDeletionLegacyPassPass(Registry);
initializeLoopAccessLegacyAnalysisPass(Registry);
@@ -141,6 +142,10 @@ void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createJumpThreadingPass());
}
void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopSinkPass());
}
void LLVMAddLICMPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLICMPass());
}


@@ -0,0 +1,286 @@
; RUN: opt -S -loop-sink < %s | FileCheck %s
@g = global i32 0, align 4
; b1
; / \
; b2 b6
; / \ |
; b3 b4 |
; \ / |
; b5 |
; \ /
; b7
; preheader: 1000
; b2: 15
; b3: 7
; b4: 7
; Sink load to b2
; CHECK: t1
; CHECK: .b2:
; CHECK: load i32, i32* @g
; CHECK: .b3:
; CHECK-NOT: load i32, i32* @g
define i32 @t1(i32, i32) #0 {
%3 = icmp eq i32 %1, 0
br i1 %3, label %.exit, label %.preheader
.preheader:
%invariant = load i32, i32* @g
br label %.b1
.b1:
%iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
%c1 = icmp sgt i32 %iv, %0
br i1 %c1, label %.b2, label %.b6, !prof !1
.b2:
%c2 = icmp sgt i32 %iv, 1
br i1 %c2, label %.b3, label %.b4
.b3:
%t3 = sub nsw i32 %invariant, %iv
br label %.b5
.b4:
%t4 = add nsw i32 %invariant, %iv
br label %.b5
.b5:
%p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
%t5 = mul nsw i32 %p5, 5
br label %.b7
.b6:
%t6 = add nsw i32 %iv, 100
br label %.b7
.b7:
%p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
%t7 = add nuw nsw i32 %iv, 1
%c7 = icmp eq i32 %t7, %p7
br i1 %c7, label %.b1, label %.exit, !prof !3
.exit:
ret i32 10
}
; b1
; / \
; b2 b6
; / \ |
; b3 b4 |
; \ / |
; b5 |
; \ /
; b7
; preheader: 500
; b1: 16016
; b3: 8
; b6: 8
; Sink load to b3 and b6
; CHECK: t2
; CHECK: .preheader:
; CHECK-NOT: load i32, i32* @g
; CHECK: .b3:
; CHECK: load i32, i32* @g
; CHECK: .b4:
; CHECK: .b6:
; CHECK: load i32, i32* @g
; CHECK: .b7:
define i32 @t2(i32, i32) #0 {
%3 = icmp eq i32 %1, 0
br i1 %3, label %.exit, label %.preheader
.preheader:
%invariant = load i32, i32* @g
br label %.b1
.b1:
%iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
%c1 = icmp sgt i32 %iv, %0
br i1 %c1, label %.b2, label %.b6, !prof !2
.b2:
%c2 = icmp sgt i32 %iv, 1
br i1 %c2, label %.b3, label %.b4, !prof !1
.b3:
%t3 = sub nsw i32 %invariant, %iv
br label %.b5
.b4:
%t4 = add nsw i32 5, %iv
br label %.b5
.b5:
%p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
%t5 = mul nsw i32 %p5, 5
br label %.b7
.b6:
%t6 = add nsw i32 %iv, %invariant
br label %.b7
.b7:
%p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
%t7 = add nuw nsw i32 %iv, 1
%c7 = icmp eq i32 %t7, %p7
br i1 %c7, label %.b1, label %.exit, !prof !3
.exit:
ret i32 10
}
; b1
; / \
; b2 b6
; / \ |
; b3 b4 |
; \ / |
; b5 |
; \ /
; b7
; preheader: 500
; b3: 8
; b5: 16008
; Do not sink load from preheader.
; CHECK: t3
; CHECK: .preheader:
; CHECK: load i32, i32* @g
; CHECK: .b1:
; CHECK-NOT: load i32, i32* @g
define i32 @t3(i32, i32) #0 {
%3 = icmp eq i32 %1, 0
br i1 %3, label %.exit, label %.preheader
.preheader:
%invariant = load i32, i32* @g
br label %.b1
.b1:
%iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
%c1 = icmp sgt i32 %iv, %0
br i1 %c1, label %.b2, label %.b6, !prof !2
.b2:
%c2 = icmp sgt i32 %iv, 1
br i1 %c2, label %.b3, label %.b4, !prof !1
.b3:
%t3 = sub nsw i32 %invariant, %iv
br label %.b5
.b4:
%t4 = add nsw i32 5, %iv
br label %.b5
.b5:
%p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
%t5 = mul nsw i32 %p5, %invariant
br label %.b7
.b6:
%t6 = add nsw i32 %iv, 5
br label %.b7
.b7:
%p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
%t7 = add nuw nsw i32 %iv, 1
%c7 = icmp eq i32 %t7, %p7
br i1 %c7, label %.b1, label %.exit, !prof !3
.exit:
ret i32 10
}
; For single-BB loop with <=1 avg trip count, sink load to b1
; CHECK: t4
; CHECK: .preheader:
; CHECK-NOT: load i32, i32* @g
; CHECK: .b1:
; CHECK: load i32, i32* @g
; CHECK: .exit:
define i32 @t4(i32, i32) #0 {
.preheader:
%invariant = load i32, i32* @g
br label %.b1
.b1:
%iv = phi i32 [ %t1, %.b1 ], [ 0, %.preheader ]
%t1 = add nsw i32 %invariant, %iv
%c1 = icmp sgt i32 %iv, %0
br i1 %c1, label %.b1, label %.exit, !prof !1
.exit:
ret i32 10
}
; b1
; / \
; b2 b6
; / \ |
; b3 b4 |
; \ / |
; b5 |
; \ /
; b7
; preheader: 1000
; b2: 15
; b3: 7
; b4: 7
; There is an aliasing store in the loop, so do not sink the load.
; CHECK: t5
; CHECK: .preheader:
; CHECK: load i32, i32* @g
; CHECK: .b1:
; CHECK-NOT: load i32, i32* @g
define i32 @t5(i32, i32*) #0 {
%3 = icmp eq i32 %0, 0
br i1 %3, label %.exit, label %.preheader
.preheader:
%invariant = load i32, i32* @g
br label %.b1
.b1:
%iv = phi i32 [ %t7, %.b7 ], [ 0, %.preheader ]
%c1 = icmp sgt i32 %iv, %0
br i1 %c1, label %.b2, label %.b6, !prof !1
.b2:
%c2 = icmp sgt i32 %iv, 1
br i1 %c2, label %.b3, label %.b4
.b3:
%t3 = sub nsw i32 %invariant, %iv
br label %.b5
.b4:
%t4 = add nsw i32 %invariant, %iv
br label %.b5
.b5:
%p5 = phi i32 [ %t3, %.b3 ], [ %t4, %.b4 ]
%t5 = mul nsw i32 %p5, 5
br label %.b7
.b6:
%t6 = call i32 @foo()
br label %.b7
.b7:
%p7 = phi i32 [ %t6, %.b6 ], [ %t5, %.b5 ]
%t7 = add nuw nsw i32 %iv, 1
%c7 = icmp eq i32 %t7, %p7
br i1 %c7, label %.b1, label %.exit, !prof !3
.exit:
ret i32 10
}
declare i32 @foo()
!1 = !{!"branch_weights", i32 1, i32 2000}
!2 = !{!"branch_weights", i32 2000, i32 1}
!3 = !{!"branch_weights", i32 100, i32 1}


@@ -0,0 +1,60 @@
; RUN: opt -S -licm < %s | FileCheck %s --check-prefix=CHECK-LICM
; RUN: opt -S -licm < %s | opt -S -loop-sink | FileCheck %s --check-prefix=CHECK-SINK
; Original source code:
; int g;
; int foo(int p, int x) {
; for (int i = 0; i != x; i++)
; if (__builtin_expect(i == p, 0)) {
; x += g; x *= g;
; }
; return x;
; }
;
; The load of global value g is hoisted to the preheader by LICM and should be
; sunk back into the cold block by loop-sink.
@g = global i32 0, align 4
define i32 @foo(i32, i32) #0 {
%3 = icmp eq i32 %1, 0
br i1 %3, label %._crit_edge, label %.lr.ph.preheader
.lr.ph.preheader:
br label %.lr.ph
; CHECK-LICM: .lr.ph.preheader:
; CHECK-LICM: load i32, i32* @g
; CHECK-LICM: br label %.lr.ph
.lr.ph:
%.03 = phi i32 [ %8, %.combine ], [ 0, %.lr.ph.preheader ]
%.012 = phi i32 [ %.1, %.combine ], [ %1, %.lr.ph.preheader ]
%4 = icmp eq i32 %.03, %0
br i1 %4, label %.then, label %.combine, !prof !1
.then:
%5 = load i32, i32* @g, align 4
%6 = add nsw i32 %5, %.012
%7 = mul nsw i32 %6, %5
br label %.combine
; CHECK-SINK: .then:
; CHECK-SINK: load i32, i32* @g
; CHECK-SINK: br label %.combine
.combine:
%.1 = phi i32 [ %7, %.then ], [ %.012, %.lr.ph ]
%8 = add nuw nsw i32 %.03, 1
%9 = icmp eq i32 %8, %.1
br i1 %9, label %._crit_edge.loopexit, label %.lr.ph
._crit_edge.loopexit:
%.1.lcssa = phi i32 [ %.1, %.combine ]
br label %._crit_edge
._crit_edge:
%.01.lcssa = phi i32 [ 0, %2 ], [ %.1.lcssa, %._crit_edge.loopexit ]
ret i32 %.01.lcssa
}
!1 = !{!"branch_weights", i32 1, i32 2000}