New pass: guard widening

Summary: Implement guard widening in LLVM. Description from GuardWidening.cpp: The semantics of the `@llvm.experimental.guard` intrinsic lets LLVM transform it so that it fails more often that it did before the transform. This optimization is called "widening" and can be used hoist and common runtime checks in situations like these: ``` %cmp0 = 7 u< Length call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ] call @unknown_side_effects() %cmp1 = 9 u< Length call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ] ... ``` to ``` %cmp0 = 9 u< Length call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ] call @unknown_side_effects() ... ``` If `%cmp0` is false, `@llvm.experimental.guard` will "deoptimize" back to a generic implementation of the same function, which will have the correct semantics from that point onward. It is always _legal_ to deoptimize (so replacing `%cmp0` with false is "correct"), though it may not always be profitable to do so. NB! This pass is a work in progress. It hasn't been tuned to be "production ready" yet. It is known to have quadriatic running time and will not scale to large numbers of guards Reviewers: reames, atrick, bogner, apilipenko, nlewycky Subscribers: mcrosier, llvm-commits Differential Revision: http://reviews.llvm.org/D20143 llvm-svn: 269997
2024-11-26 04:32:44 +01:00 · 2016-05-18 22:55:34 +00:00 · 2016-05-18 22:55:34 +00:00 · 479a57ca4a
commit 479a57ca4a
parent 7cac814db8
10 changed files with 813 additions and 0 deletions
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@ -142,6 +142,7 @@ void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeForceFunctionAttrsLegacyPassPass(PassRegistry&);
 void initializeGCMachineCodeAnalysisPass(PassRegistry&);
 void initializeGCModuleInfoPass(PassRegistry&);
+void initializeGuardWideningLegacyPassPass(PassRegistry&);
 void initializeGVNLegacyPassPass(PassRegistry&);
 void initializeGlobalDCELegacyPassPass(PassRegistry&);
 void initializeGlobalOptLegacyPassPass(PassRegistry&);
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@ -99,6 +99,7 @@ namespace {
      (void) llvm::createGlobalDCEPass();
      (void) llvm::createGlobalOptimizerPass();
      (void) llvm::createGlobalsAAWrapperPass();
+      (void) llvm::createGuardWideningPass();
      (void) llvm::createIPConstantPropagationPass();
      (void) llvm::createIPSCCPPass();
      (void) llvm::createInductiveRangeCheckEliminationPass();
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@ -81,6 +81,16 @@ FunctionPass *createDeadStoreEliminationPass();
 //
 FunctionPass *createAggressiveDCEPass();

+
+//===----------------------------------------------------------------------===//
+//
+// GuardWidening - An optimization over the @llvm.experimental.guard intrinsic
+// that (optimistically) combines multiple guards into one to have fewer checks
+// at runtime.
+//
+FunctionPass *createGuardWideningPass();
+
+
 //===----------------------------------------------------------------------===//
 //
 // BitTrackingDCE - This pass uses a bit-tracking DCE algorithm in order to
--- a/include/llvm/Transforms/Scalar/GuardWidening.h
+++ b/include/llvm/Transforms/Scalar/GuardWidening.h
@ -0,0 +1,32 @@
+//===- GuardWidening.h - ----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Guard widening is an optimization over the @llvm.experimental.guard intrinsic
+// that (optimistically) combines multiple guards into one to have fewer checks
+// at runtime.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_TRANSFORMS_SCALAR_GUARD_WIDENING_H
+#define LLVM_TRANSFORMS_SCALAR_GUARD_WIDENING_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Function;
+
+struct GuardWideningPass : public PassInfoMixin<GuardWideningPass> {
+  PreservedAnalyses run(Function &F, AnalysisManager<Function> &AM);
+};
+}
+
+
+#endif  // LLVM_TRANSFORMS_SCALAR_GUARD_WIDENING_H
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@ -66,6 +66,7 @@
 #include "llvm/Transforms/Scalar/DCE.h"
 #include "llvm/Transforms/Scalar/DeadStoreElimination.h"
 #include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/Transforms/Scalar/GuardWidening.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Scalar/LoopRotation.h"
 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@ -118,6 +118,7 @@ FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
 FUNCTION_PASS("no-op-function", NoOpFunctionPass())
 FUNCTION_PASS("loweratomic", LowerAtomicPass())
 FUNCTION_PASS("lower-expect", LowerExpectIntrinsicPass())
+FUNCTION_PASS("guard-widening", GuardWideningPass())
 FUNCTION_PASS("gvn", GVN())
 FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
 FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@ -10,6 +10,7 @@ add_llvm_library(LLVMScalarOpts
  EarlyCSE.cpp
  FlattenCFGPass.cpp
  Float2Int.cpp
+  GuardWidening.cpp
  GVN.cpp
  InductiveRangeCheckElimination.cpp
  IndVarSimplify.cpp
--- a/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/lib/Transforms/Scalar/GuardWidening.cpp
@ -0,0 +1,428 @@
+//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the guard widening pass.  The semantics of the
+// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails
+// more often that it did before the transform.  This optimization is called
+// "widening" and can be used hoist and common runtime checks in situations like
+// these:
+//
+//    %cmp0 = 7 u< Length
+//    call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+//    call @unknown_side_effects()
+//    %cmp1 = 9 u< Length
+//    call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
+//    ...
+//
+// =>
+//
+//    %cmp0 = 9 u< Length
+//    call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+//    call @unknown_side_effects()
+//    ...
+//
+// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
+// generic implementation of the same function, which will have the correct
+// semantics from that point onward.  It is always _legal_ to deoptimize (so
+// replacing %cmp0 with false is "correct"), though it may not always be
+// profitable to do so.
+//
+// NB! This pass is a work in progress.  It hasn't been tuned to be "production
+// ready" yet.  It is known to have quadriatic running time and will not scale
+// to large numbers of guards
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "guard-widening"
+
+namespace {
+
+class GuardWideningImpl {
+  DominatorTree &DT;
+  PostDominatorTree &PDT;
+  LoopInfo &LI;
+
+  /// The set of guards whose conditions have been widened into dominating
+  /// guards.
+  SmallVector<IntrinsicInst *, 16> EliminatedGuards;
+
+  /// The set of guards which have been widened to include conditions to other
+  /// guards.
+  DenseSet<IntrinsicInst *> WidenedGuards;
+
+  /// Try to eliminate guard \p Guard by widening it into an earlier dominating
+  /// guard.  \p DFSI is the DFS iterator on the dominator tree that is
+  /// currently visiting the block containing \p Guard, and \p GuardsPerBlock
+  /// maps BasicBlocks to the set of guards seen in that block.
+  bool eliminateGuardViaWidening(
+      IntrinsicInst *Guard, const df_iterator<DomTreeNode *> &DFSI,
+      const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+          GuardsPerBlock);
+
+  /// Used to keep track of which widening potential is more effective.
+  enum WideningScore {
+    /// Don't widen.
+    WS_IllegalOrNegative,
+
+    /// Widening is performance neutral as far as the cycles spent in check
+    /// conditions goes (but can still help, e.g., code layout, having less
+    /// deopt state).
+    WS_Neutral,
+
+    /// Widening is profitable.
+    WS_Positive,
+
+    /// Widening is very profitable.  Not significantly different from \c
+    /// WS_Positive, except by the order.
+    WS_VeryPositive
+  };
+
+  static StringRef scoreTypeToString(WideningScore WS);
+
+  /// Compute the score for widening the condition in \p DominatedGuard
+  /// (contained in \p DominatedGuardLoop) into \p DominatingGuard (contained in
+  /// \p DominatingGuardLoop).
+  WideningScore computeWideningScore(IntrinsicInst *DominatedGuard,
+                                     Loop *DominatedGuardLoop,
+                                     IntrinsicInst *DominatingGuard,
+                                     Loop *DominatingGuardLoop);
+
+  /// Helper to check if \p V can be hoisted to \p InsertPos.
+  bool isAvailableAt(Value *V, Instruction *InsertPos) {
+    SmallPtrSet<Instruction *, 8> Visited;
+    return isAvailableAt(V, InsertPos, Visited);
+  }
+
+  bool isAvailableAt(Value *V, Instruction *InsertPos,
+                     SmallPtrSetImpl<Instruction *> &Visited);
+
+  /// Helper to hoist \p V to \p InsertPos.  Guaranteed to succeed if \c
+  /// isAvailableAt returned true.
+  void makeAvailableAt(Value *V, Instruction *InsertPos);
+
+  /// Common helper used by \c widenGuard and \c isWideningCondProfitable.  Try
+  /// to generate an expression computing the logical AND of \p Cond0 and \p
+  /// Cond1.  Return true if the expression computing the AND is only as
+  /// expensive as computing one of the two. If \p InsertPt is true then
+  /// actually generate the resulting expression, make it available at \p
+  /// InsertPt and return it in \p Result (else no change to the IR is made).
+  bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
+                       Value *&Result);
+
+  /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
+  /// computing only one of the two expressions?
+  bool isWideningCondProfitable(Value *Cond0, Value *Cond1) {
+    Value *ResultUnused;
+    return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused);
+  }
+
+  /// Widen \p ToWiden to fail if \p NewCondition is false (in addition to
+  /// whatever it is already checking).
+  void widenGuard(IntrinsicInst *ToWiden, Value *NewCondition) {
+    Value *Result;
+    widenCondCommon(ToWiden->getArgOperand(0), NewCondition, ToWiden, Result);
+    ToWiden->setArgOperand(0, Result);
+  }
+
+public:
+  explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree &PDT,
+                             LoopInfo &LI)
+      : DT(DT), PDT(PDT), LI(LI) {}
+
+  /// The entry point for this pass.
+  bool run();
+};
+
+struct GuardWideningLegacyPass : public FunctionPass {
+  static char ID;
+  GuardWideningPass Impl;
+
+  GuardWideningLegacyPass() : FunctionPass(ID) {
+    initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+    return GuardWideningImpl(
+               getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+               getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree(),
+               getAnalysis<LoopInfoWrapperPass>().getLoopInfo()).run();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+  }
+};
+
+}
+
+bool GuardWideningImpl::run() {
+  using namespace llvm::PatternMatch;
+
+  DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> GuardsInBlock;
+  bool Changed = false;
+
+  for (auto DFI = df_begin(DT.getRootNode()), DFE = df_end(DT.getRootNode());
+       DFI != DFE; ++DFI) {
+    auto *BB = (*DFI)->getBlock();
+    auto &CurrentList = GuardsInBlock[BB];
+
+    for (auto &I : *BB)
+      if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+        CurrentList.push_back(cast<IntrinsicInst>(&I));
+
+    for (auto *II : CurrentList)
+      Changed |= eliminateGuardViaWidening(II, DFI, GuardsInBlock);
+  }
+
+  for (auto *II : EliminatedGuards)
+    if (!WidenedGuards.count(II))
+      II->eraseFromParent();
+
+  return Changed;
+}
+
+bool GuardWideningImpl::eliminateGuardViaWidening(
+    IntrinsicInst *GuardInst, const df_iterator<DomTreeNode *> &DFSI,
+    const DenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 8>> &
+        GuardsInBlock) {
+  IntrinsicInst *BestSoFar = nullptr;
+  auto BestScoreSoFar = WS_IllegalOrNegative;
+  auto *GuardInstLoop = LI.getLoopFor(GuardInst->getParent());
+
+  // In the set of dominating guards, find the one we can merge GuardInst with
+  // for the most profit.
+  for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
+    auto *CurBB = DFSI.getPath(i)->getBlock();
+    auto *CurLoop = LI.getLoopFor(CurBB);
+    assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
+    const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
+
+    auto I = GuardsInCurBB.begin();
+    auto E = GuardsInCurBB.end();
+
+#ifndef NDEBUG
+    {
+      unsigned Index = 0;
+      for (auto &I : *CurBB) {
+        if (Index == GuardsInCurBB.size())
+          break;
+        if (GuardsInCurBB[Index] == &I)
+          Index++;
+      }
+      assert(Index == GuardsInCurBB.size() &&
+             "Guards expected to be in order!");
+    }
+#endif
+
+    assert((i == (e - 1)) == (GuardInst->getParent() == CurBB) && "Bad DFS?");
+
+    if (i == (e - 1)) {
+      // Corner case: make sure we're only looking at guards strictly dominating
+      // GuardInst when visiting GuardInst->getParent().
+      auto NewEnd = std::find(I, E, GuardInst);
+      assert(NewEnd != E && "GuardInst not in its own block?");
+      E = NewEnd;
+    }
+
+    for (auto *Candidate : make_range(I, E)) {
+      auto Score =
+          computeWideningScore(GuardInst, GuardInstLoop, Candidate, CurLoop);
+      DEBUG(dbgs() << "Score between " << *GuardInst->getArgOperand(0)
+                   << " and " << *Candidate->getArgOperand(0) << " is "
+                   << scoreTypeToString(Score) << "\n");
+      if (Score > BestScoreSoFar) {
+        BestScoreSoFar = Score;
+        BestSoFar = Candidate;
+      }
+    }
+  }
+
+  if (BestScoreSoFar == WS_IllegalOrNegative) {
+    DEBUG(dbgs() << "Did not eliminate guard " << *GuardInst << "\n");
+    return false;
+  }
+
+  assert(BestSoFar != GuardInst && "Should have never visited same guard!");
+  assert(DT.dominates(BestSoFar, GuardInst) && "Should be!");
+
+  DEBUG(dbgs() << "Widening " << *GuardInst << " into " << *BestSoFar
+               << " with score " << scoreTypeToString(BestScoreSoFar) << "\n");
+  widenGuard(BestSoFar, GuardInst->getArgOperand(0));
+  GuardInst->setArgOperand(0, ConstantInt::getTrue(GuardInst->getContext()));
+  EliminatedGuards.push_back(GuardInst);
+  WidenedGuards.insert(BestSoFar);
+  return true;
+}
+
+GuardWideningImpl::WideningScore GuardWideningImpl::computeWideningScore(
+    IntrinsicInst *DominatedGuard, Loop *DominatedGuardLoop,
+    IntrinsicInst *DominatingGuard, Loop *DominatingGuardLoop) {
+  bool HoistingOutOfLoop = false;
+
+  if (DominatingGuardLoop != DominatedGuardLoop) {
+    if (DominatingGuardLoop &&
+        !DominatingGuardLoop->contains(DominatedGuardLoop))
+      return WS_IllegalOrNegative;
+
+    HoistingOutOfLoop = true;
+  }
+
+  if (!isAvailableAt(DominatedGuard->getArgOperand(0), DominatingGuard))
+    return WS_IllegalOrNegative;
+
+  bool HoistingOutOfIf =
+      !PDT.dominates(DominatedGuard->getParent(), DominatingGuard->getParent());
+
+  if (isWideningCondProfitable(DominatedGuard->getArgOperand(0),
+                               DominatingGuard->getArgOperand(0)))
+    return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
+
+  if (HoistingOutOfLoop)
+    return WS_Positive;
+
+  return HoistingOutOfIf ? WS_IllegalOrNegative : WS_Neutral;
+}
+
+bool GuardWideningImpl::isAvailableAt(Value *V, Instruction *Loc,
+                                      SmallPtrSetImpl<Instruction *> &Visited) {
+  auto *Inst = dyn_cast<Instruction>(V);
+  if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
+    return true;
+
+  if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
+      Inst->mayReadFromMemory())
+    return false;
+
+  Visited.insert(Inst);
+
+  // We only want to go _up_ the dominance chain when recursing.
+  assert(!isa<PHINode>(Loc) &&
+         "PHIs should return false for isSafeToSpeculativelyExecute");
+  assert(DT.isReachableFromEntry(Inst->getParent()) &&
+         "We did a DFS from the block entry!");
+  return all_of(Inst->operands(),
+                [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
+}
+
+void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) {
+  auto *Inst = dyn_cast<Instruction>(V);
+  if (!Inst || DT.dominates(Inst, Loc))
+    return;
+
+  assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
+         !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
+
+  for (Value *Op : Inst->operands())
+    makeAvailableAt(Op, Loc);
+
+  Inst->moveBefore(Loc);
+}
+
+bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
+                                        Instruction *InsertPt, Value *&Result) {
+  using namespace llvm::PatternMatch;
+
+  {
+    // L >u C0 && L >u C1  ->  L >u max(C0, C1)
+    ConstantInt *RHS0, *RHS1;
+    Value *LHS;
+    ICmpInst::Predicate Pred0, Pred1;
+    if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
+        match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
+
+      // TODO: This logic should be generalized and refactored into a new
+      // Constant::getEquivalentICmp helper.
+      if (Pred0 == ICmpInst::ICMP_NE && RHS0->isZero())
+        Pred0 = ICmpInst::ICMP_UGT;
+      if (Pred1 == ICmpInst::ICMP_NE && RHS1->isZero())
+        Pred1 = ICmpInst::ICMP_UGT;
+
+      if (Pred0 == ICmpInst::ICMP_UGT && Pred1 == ICmpInst::ICMP_UGT) {
+        if (InsertPt) {
+          ConstantInt *NewRHS =
+              RHS0->getValue().ugt(RHS1->getValue()) ? RHS0 : RHS1;
+          Result = new ICmpInst(InsertPt, ICmpInst::ICMP_UGT, LHS, NewRHS,
+                                "wide.chk");
+        }
+
+        return true;
+      }
+    }
+  }
+
+  // Base case -- just logical-and the two conditions together.
+
+  if (InsertPt) {
+    makeAvailableAt(Cond0, InsertPt);
+    makeAvailableAt(Cond1, InsertPt);
+
+    Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
+  }
+
+  // We were not able to compute Cond0 AND Cond1 for the price of one.
+  return false;
+}
+
+PreservedAnalyses GuardWideningPass::run(Function &F,
+                                         AnalysisManager<Function> &AM) {
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  bool Changed = GuardWideningImpl(DT, PDT, LI).run();
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
+  switch (WS) {
+  case WS_IllegalOrNegative:
+    return "IllegalOrNegative";
+  case WS_Neutral:
+    return "Neutral";
+  case WS_Positive:
+    return "Positive";
+  case WS_VeryPositive:
+    return "VeryPositive";
+  }
+
+  llvm_unreachable("Fully covered switch above!");
+}
+
+char GuardWideningLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+                    false, false)
+
+FunctionPass *llvm::createGuardWideningPass() {
+  return new GuardWideningLegacyPass();
+}
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@ -41,6 +41,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
  initializeDeadInstEliminationPass(Registry);
  initializeScalarizerPass(Registry);
  initializeDSELegacyPassPass(Registry);
+  initializeGuardWideningLegacyPassPass(Registry);
  initializeGVNLegacyPassPass(Registry);
  initializeEarlyCSELegacyPassPass(Registry);
  initializeFlattenCFGPassPass(Registry);
--- a/test/Transforms/GuardWidening/basic.ll
+++ b/test/Transforms/GuardWidening/basic.ll
@ -0,0 +1,337 @@
+; RUN: opt -S -guard-widening < %s        | FileCheck %s
+; RUN: opt -S -passes=guard-widening < %s | FileCheck %s
+
+declare void @llvm.experimental.guard(i1,...)
+
+; Basic test case: we wide the first check to check both the
+; conditions.
+define void @f_0(i1 %cond_0, i1 %cond_1) {
+; CHECK-LABEL: @f_0(
+entry:
+; CHECK:  %wide.chk = and i1 %cond_0, %cond_1
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  ret void
+
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  ret void
+}
+
+; Same as @f_0, but with using a more general notion of postdominance.
+define void @f_1(i1 %cond_0, i1 %cond_1) {
+; CHECK-LABEL: @f_1(
+entry:
+; CHECK:  %wide.chk = and i1 %cond_0, %cond_1
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  br i1 undef, label %left, label %right
+
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %left, label %right
+
+left:
+  br label %merge
+
+right:
+  br label %merge
+
+merge:
+; CHECK: merge:
+; CHECK-NOT: call void (i1, ...) @llvm.experimental.guard(
+; CHECK: ret void
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  ret void
+}
+
+; Like @f_1, but we have some code we need to hoist before we can
+; widen a dominanting check.
+define void @f_2(i32 %a, i32 %b) {
+; CHECK-LABEL: @f_2(
+entry:
+; CHECK:  %cond_0 = icmp ult i32 %a, 10
+; CHECK:  %cond_1 = icmp ult i32 %b, 10
+; CHECK:  %wide.chk = and i1 %cond_0, %cond_1
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  br i1 undef, label %left, label %right
+
+  %cond_0 = icmp ult i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %left, label %right
+
+left:
+  br label %merge
+
+right:
+  br label %merge
+
+merge:
+  %cond_1 = icmp ult i32 %b, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  ret void
+}
+
+; Negative test: don't hoist stuff out of control flow
+; indiscriminately, since that can make us do more work than needed.
+define void @f_3(i32 %a, i32 %b) {
+; CHECK-LABEL: @f_3(
+entry:
+; CHECK:  %cond_0 = icmp ult i32 %a, 10
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+; CHECK:  br i1 undef, label %left, label %right
+
+  %cond_0 = icmp ult i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %left, label %right
+
+left:
+; CHECK: left:
+; CHECK:   %cond_1 = icmp ult i32 %b, 10
+; CHECK:   call void (i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+; CHECK:   ret void
+
+  %cond_1 = icmp ult i32 %b, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  ret void
+
+right:
+  ret void
+}
+
+; But hoisting out of control flow is fine if it makes a loop computed
+; condition loop invariant.  This behavior may require some tuning in
+; the future.
+define void @f_4(i32 %a, i32 %b) {
+; CHECK-LABEL: @f_4(
+entry:
+; CHECK:  %cond_0 = icmp ult i32 %a, 10
+; CHECK:  %cond_1 = icmp ult i32 %b, 10
+; CHECK:  %wide.chk = and i1 %cond_0, %cond_1
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  br i1 undef, label %loop, label %leave
+
+  %cond_0 = icmp ult i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %loop, label %leave
+
+loop:
+  %cond_1 = icmp ult i32 %b, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  br i1 undef, label %loop, label %leave
+
+leave:
+  ret void
+}
+
+; Hoisting out of control flow is also fine if we can widen the
+; dominating check without doing any extra work.
+define void @f_5(i32 %a) {
+; CHECK-LABEL: @f_5(
+entry:
+; CHECK:  %wide.chk = icmp ugt i32 %a, 10
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  br i1 undef, label %left, label %right
+
+  %cond_0 = icmp ugt i32 %a, 7
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %left, label %right
+
+left:
+  %cond_1 = icmp ugt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  ret void
+
+right:
+  ret void
+}
+
+; Negative test: the load from %a can be safely speculated to before
+; the first guard, but there is no guarantee that it will produce the
+; same value.
+define void @f_6(i1* dereferenceable(32) %a, i1* %b, i1 %unknown) {
+; CHECK-LABEL: @f_6(
+; CHECK: call void (i1, ...) @llvm.experimental.guard(
+; CHECK: call void (i1, ...) @llvm.experimental.guard(
+; CHECK: ret void
+entry:
+  %cond_0 = load i1, i1* %a
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  store i1 %unknown, i1* %b
+  %cond_1 = load i1, i1* %a
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  ret void
+}
+
+; All else equal, we try to widen the earliest guard we can.  This
+; heuristic can use some tuning.
+define void @f_7(i32 %a, i1* %cond_buf) {
+; CHECK-LABEL: @f_7(
+entry:
+; CHECK:  %cond_1 = load volatile i1, i1* %cond_buf
+; CHECK:  %cond_3 = icmp ult i32 %a, 7
+; CHECK:  %wide.chk = and i1 %cond_1, %cond_3
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  %cond_2 = load volatile i1, i1* %cond_buf
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %cond_2) [ "deopt"() ]
+; CHECK:  br i1 undef, label %left, label %right
+
+  %cond_1 = load volatile i1, i1* %cond_buf
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  %cond_2 = load volatile i1, i1* %cond_buf
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_2) [ "deopt"() ]
+  br i1 undef, label %left, label %right
+
+left:
+  %cond_3 = icmp ult i32 %a, 7
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_3) [ "deopt"() ]
+  br label %left
+
+right:
+  ret void
+}
+
+; In this case the earliest dominating guard is in a loop, and we
+; don't want to put extra work in there.  This heuristic can use some
+; tuning.
+define void @f_8(i32 %a, i1 %cond_1, i1 %cond_2) {
+; CHECK-LABEL: @f_8(
+entry:
+  br label %loop
+
+loop:
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  br i1 undef, label %loop, label %leave
+
+leave:
+; CHECK: leave:
+; CHECK:  %cond_3 = icmp ult i32 %a, 7
+; CHECK:  %wide.chk = and i1 %cond_2, %cond_3
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  br i1 undef, label %loop2, label %leave2
+
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_2) [ "deopt"() ]
+  br i1 undef, label %loop2, label %leave2
+
+loop2:
+  %cond_3 = icmp ult i32 %a, 7
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_3) [ "deopt"() ]
+  br label %loop2
+
+leave2:
+  ret void
+}
+
+; In cases like these where there isn't any "obviously profitable"
+; widening sites, we refuse to do anything.
+define void @f_9(i32 %a, i1 %cond_0, i1 %cond_1) {
+; CHECK-LABEL: @f_9(
+entry:
+  br label %first_loop
+
+first_loop:
+; CHECK: first_loop:
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+; CHECK:  br i1 undef, label %first_loop, label %second_loop
+
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %first_loop, label %second_loop
+
+second_loop:
+; CHECK: second_loop:
+; CHECK:   call void (i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+; CHECK:   br label %second_loop
+
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  br label %second_loop
+}
+
+; Same situation as in @f_9: no "obviously profitable" widening sites,
+; so we refuse to do anything.
+define void @f_10(i32 %a, i1 %cond_0, i1 %cond_1) {
+; CHECK-LABEL: @f_10(
+entry:
+  br label %loop
+
+loop:
+; CHECK: loop:
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+; CHECK:  br i1 undef, label %loop, label %no_loop
+
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %loop, label %no_loop
+
+no_loop:
+; CHECK: no_loop:
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+; CHECK:  ret void
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  ret void
+}
+
+; With guards in loops, we're okay hoisting out the guard into the
+; containing loop.
+define void @f_11(i32 %a, i1 %cond_0, i1 %cond_1) {
+; CHECK-LABEL: @f_11(
+entry:
+  br label %inner
+
+inner:
+; CHECK: inner:
+; CHECK:  %wide.chk = and i1 %cond_0, %cond_1
+; CHECK:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK:  br i1 undef, label %inner, label %outer
+
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_0) [ "deopt"() ]
+  br i1 undef, label %inner, label %outer
+
+outer:
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond_1) [ "deopt"() ]
+  br label %inner
+}
+
+; Checks that we are adequately guarded against exponential-time
+; behavior when hoisting code.
+define void @f_12(i32 %a0) {
+; CHECK-LABEL: @f_12
+
+; Eliding the earlier 29 multiplications for brevity
+; CHECK:  %a30 = mul i32 %a29, %a29
+; CHECK-NEXT:  %cond = trunc i32 %a30 to i1
+; CHECK-NEXT:  %wide.chk = and i1 true, %cond
+; CHECK-NEXT:  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk) [ "deopt"() ]
+; CHECK-NEXT:  ret void
+
+entry:
+  call void(i1, ...) @llvm.experimental.guard(i1 true) [ "deopt"() ]
+  %a1 = mul i32 %a0, %a0
+  %a2 = mul i32 %a1, %a1
+  %a3 = mul i32 %a2, %a2
+  %a4 = mul i32 %a3, %a3
+  %a5 = mul i32 %a4, %a4
+  %a6 = mul i32 %a5, %a5
+  %a7 = mul i32 %a6, %a6
+  %a8 = mul i32 %a7, %a7
+  %a9 = mul i32 %a8, %a8
+  %a10 = mul i32 %a9, %a9
+  %a11 = mul i32 %a10, %a10
+  %a12 = mul i32 %a11, %a11
+  %a13 = mul i32 %a12, %a12
+  %a14 = mul i32 %a13, %a13
+  %a15 = mul i32 %a14, %a14
+  %a16 = mul i32 %a15, %a15
+  %a17 = mul i32 %a16, %a16
+  %a18 = mul i32 %a17, %a17
+  %a19 = mul i32 %a18, %a18
+  %a20 = mul i32 %a19, %a19
+  %a21 = mul i32 %a20, %a20
+  %a22 = mul i32 %a21, %a21
+  %a23 = mul i32 %a22, %a22
+  %a24 = mul i32 %a23, %a23
+  %a25 = mul i32 %a24, %a24
+  %a26 = mul i32 %a25, %a25
+  %a27 = mul i32 %a26, %a26
+  %a28 = mul i32 %a27, %a27
+  %a29 = mul i32 %a28, %a28
+  %a30 = mul i32 %a29, %a29
+  %cond = trunc i32 %a30 to i1
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+  ret void
+}