[IROutliner] Adding a cost model, and debug option to turn the model off.

This adds a cost model that takes into account the total number of machine instructions to be removed from each region, the number of instructions added by adding a new function with a set of instructions, and the instructions added by handling arguments. Tests not adding flags: llvm/test/Transforms/IROutliner/outlining-cost-model.ll Reviewers: jroelofs, paquette Differential Revision: https://reviews.llvm.org/D87299
2024-11-25 04:02:41 +01:00 · 2020-09-14 16:58:15 -05:00 · 2020-09-14 16:58:15 -05:00 · 2f69ee6563
commit 2f69ee6563
parent 6d7c1c4298
34 changed files with 533 additions and 35 deletions
--- a/include/llvm/Transforms/IPO/IROutliner.h
+++ b/include/llvm/Transforms/IPO/IROutliner.h
@ -145,6 +145,12 @@ struct OutlinableRegion {
  /// function has been extracted, the start and end of the BasicBlock
  /// containing the called function.
  void reattachCandidate();
+
+  /// Get the size of the code removed from the region.
+  ///
+  /// \param [in] TTI - The TargetTransformInfo for the parent function.
+  /// \returns the code size of the region
+  unsigned getBenefit(TargetTransformInfo &TTI);
 };

 /// This class is a pass that identifies similarity in a Module, extracts
@ -201,6 +207,28 @@ private:
  void findAddInputsOutputs(Module &M, OutlinableRegion &Region,
                            DenseSet<unsigned> &NotSame);

+  /// Find the number of instructions that will be removed by extracting the
+  /// OutlinableRegions in \p CurrentGroup.
+  ///
+  /// \param [in] CurrentGroup - The collection of OutlinableRegions to be
+  /// analyzed.
+  /// \returns the number of outlined instructions across all regions.
+  unsigned findBenefitFromAllRegions(OutlinableGroup &CurrentGroup);
+
+  /// Find the number of instructions that will be added by reloading arguments.
+  ///
+  /// \param [in] CurrentGroup - The collection of OutlinableRegions to be
+  /// analyzed.
+  /// \returns the number of added reload instructions across all regions.
+  unsigned findCostOutputReloads(OutlinableGroup &CurrentGroup);
+
+  /// Find the cost and the benefit of \p CurrentGroup and save it back to
+  /// \p CurrentGroup.
+  ///
+  /// \param [in] M - The module being analyzed
+  /// \param [in,out] CurrentGroup - The overall outlined section
+  void findCostBenefit(Module &M, OutlinableGroup &CurrentGroup);
+
  /// Update the output mapping based on the load instruction, and the outputs
  /// of the extracted function.
  ///
@ -229,6 +257,11 @@ private:
                                    std::vector<Function *> &FuncsToRemove,
                                    unsigned &OutlinedFunctionNum);

+  /// If false, we do not worry if the cost is greater than the benefit.  This
+  /// is for debugging and testing, so that we can test small cases to ensure
+  /// that the outlining is being done correctly.
+  bool CostModel = true;
+
  /// The set of outlined Instructions, identified by their location in the
  /// sequential ordering of instructions in a Module.
  DenseSet<unsigned> Outlined;
--- a/lib/Transforms/IPO/IROutliner.cpp
+++ b/lib/Transforms/IPO/IROutliner.cpp
@ -29,6 +29,13 @@
 using namespace llvm;
 using namespace IRSimilarity;

+// This is a debug option to test small pieces of code to ensure that outlining
+// works correctly.
+static cl::opt<bool> NoCostModel(
+    "ir-outlining-no-cost", cl::init(false), cl::ReallyHidden,
+    cl::desc("Debug option to outline greedily, without restriction that "
+             "calculated benefit outweighs cost"));
+
 /// The OutlinableGroup holds all the overarching information for outlining
 /// a set of regions that are structurally similar to one another, such as the
 /// types of the overall function, the output blocks, the sets of stores needed
@ -66,6 +73,13 @@ struct OutlinableGroup {
  /// index in ArgumentTypes is an output argument.
  unsigned NumAggregateInputs = 0;

+  /// The number of instructions that will be outlined by extracting \ref
+  /// Regions.
+  unsigned Benefit = 0;
+  /// The number of added instructions needed for the outlining of the \ref
+  /// Regions.
+  unsigned Cost = 0;
+
  /// For the \ref Regions, we look at every Value.  If it is a constant,
  /// we check whether it is the same in Region.
  ///
@ -213,6 +227,40 @@ constantMatches(Value *V, unsigned GVN,
  return false;
 }

+unsigned OutlinableRegion::getBenefit(TargetTransformInfo &TTI) {
+  InstructionCost Benefit(0);
+
+  // Estimate the benefit of outlining a specific sections of the program.  We
+  // delegate mostly this task to the TargetTransformInfo so that if the target
+  // has specific changes, we can have a more accurate estimate.
+
+  // However, getInstructionCost delegates the code size calculation for
+  // arithmetic instructions to getArithmeticInstrCost in
+  // include/Analysis/TargetTransformImpl.h, where it always estimates that the
+  // code size for a division and remainder instruction to be equal to 4, and
+  // everything else to 1.  This is not an accurate representation of the
+  // division instruction for targets that have a native division instruction.
+  // To be overly conservative, we only add 1 to the number of instructions for
+  // each division instruction.
+  for (Instruction &I : *StartBB) {
+    switch (I.getOpcode()) {
+    case Instruction::FDiv:
+    case Instruction::FRem:
+    case Instruction::SDiv:
+    case Instruction::SRem:
+    case Instruction::UDiv:
+    case Instruction::URem:
+      Benefit += 1;
+      break;
+    default:
+      Benefit += TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+      break;
+    }
+  }
+
+  return *Benefit.getValue();
+}
+
 /// Find whether \p Region matches the global value numbering to Constant
 /// mapping found so far.
 ///
@ -1189,6 +1237,152 @@ void IROutliner::pruneIncompatibleRegions(
  }
 }

+unsigned IROutliner::findBenefitFromAllRegions(OutlinableGroup &CurrentGroup) {
+  unsigned RegionBenefit = 0;
+  for (OutlinableRegion *Region : CurrentGroup.Regions) {
+    TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent());
+    // We add the number of instructions in the region to the benefit as an
+    // estimate as to how much will be removed.
+    RegionBenefit += Region->getBenefit(TTI);
+    LLVM_DEBUG(dbgs() << "Adding: " << RegionBenefit
+                      << " saved instructions to overfall benefit.\n");
+    CurrentGroup.Benefit += RegionBenefit;
+  }
+
+  return RegionBenefit;
+}
+
+unsigned IROutliner::findCostOutputReloads(OutlinableGroup &CurrentGroup) {
+  unsigned OverallCost = 0;
+  for (OutlinableRegion *Region : CurrentGroup.Regions) {
+    TargetTransformInfo &TTI = getTTI(*Region->StartBB->getParent());
+
+    // Each output incurs a load after the call, so we add that to the cost.
+    for (unsigned OutputGVN : Region->GVNStores) {
+      Optional<Value *> OV = Region->Candidate->fromGVN(OutputGVN);
+      assert(OV.hasValue() && "Could not find value for GVN?");
+      Value *V = OV.getValue();
+      unsigned LoadCost =
+          TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
+                              TargetTransformInfo::TCK_CodeSize);
+
+      LLVM_DEBUG(dbgs() << "Adding: " << LoadCost
+                        << " instructions to cost for output of type "
+                        << *V->getType() << "\n");
+      OverallCost += LoadCost;
+    }
+  }
+
+  return OverallCost;
+}
+
+/// Find the extra instructions needed to handle any output values for the
+/// region.
+///
+/// \param [in] M - The Module to outline from.
+/// \param [in] CurrentGroup - The collection of OutlinableRegions to analyze.
+/// \param [in] TTI - The TargetTransformInfo used to collect information for
+/// new instruction costs.
+/// \returns the additional cost to handle the outputs.
+static unsigned findCostForOutputBlocks(Module &M,
+                                        OutlinableGroup &CurrentGroup,
+                                        TargetTransformInfo &TTI) {
+  unsigned OutputCost = 0;
+
+  for (const ArrayRef<unsigned> &OutputUse :
+       CurrentGroup.OutputGVNCombinations) {
+    IRSimilarityCandidate &Candidate = *CurrentGroup.Regions[0]->Candidate;
+    for (unsigned GVN : OutputUse) {
+      Optional<Value *> OV = Candidate.fromGVN(GVN);
+      assert(OV.hasValue() && "Could not find value for GVN?");
+      Value *V = OV.getValue();
+      unsigned StoreCost =
+          TTI.getMemoryOpCost(Instruction::Load, V->getType(), Align(1), 0,
+                              TargetTransformInfo::TCK_CodeSize);
+
+      // An instruction cost is added for each store set that needs to occur for
+      // various output combinations inside the function, plus a branch to
+      // return to the exit block.
+      LLVM_DEBUG(dbgs() << "Adding: " << StoreCost
+                        << " instructions to cost for output of type "
+                        << *V->getType() << "\n");
+      OutputCost += StoreCost;
+    }
+
+    unsigned BranchCost =
+        TTI.getCFInstrCost(Instruction::Br, TargetTransformInfo::TCK_CodeSize);
+    LLVM_DEBUG(dbgs() << "Adding " << BranchCost << " to the current cost for"
+                      << " a branch instruction\n");
+    OutputCost += BranchCost;
+  }
+
+  // If there is more than one output scheme, we must have a comparison and
+  // branch for each different item in the switch statement.
+  if (CurrentGroup.OutputGVNCombinations.size() > 1) {
+    unsigned ComparisonCost = TTI.getCmpSelInstrCost(
+        Instruction::ICmp, Type::getInt32Ty(M.getContext()),
+        Type::getInt32Ty(M.getContext()), CmpInst::BAD_ICMP_PREDICATE,
+        TargetTransformInfo::TCK_CodeSize);
+    unsigned BranchCost =
+        TTI.getCFInstrCost(Instruction::Br, TargetTransformInfo::TCK_CodeSize);
+
+    unsigned DifferentBlocks = CurrentGroup.OutputGVNCombinations.size();
+    unsigned TotalCost = ComparisonCost * BranchCost * DifferentBlocks;
+
+    LLVM_DEBUG(dbgs() << "Adding: " << TotalCost
+                      << " instructions for each switch case for each different"
+                      << " output path in a function\n");
+    OutputCost += TotalCost;
+  }
+
+  return OutputCost;
+}
+
+void IROutliner::findCostBenefit(Module &M, OutlinableGroup &CurrentGroup) {
+  unsigned RegionBenefit = findBenefitFromAllRegions(CurrentGroup);
+  CurrentGroup.Benefit += RegionBenefit;
+  LLVM_DEBUG(dbgs() << "Current Benefit: " << CurrentGroup.Benefit << "\n");
+
+  unsigned OutputReloadCost = findCostOutputReloads(CurrentGroup);
+  CurrentGroup.Cost += OutputReloadCost;
+  LLVM_DEBUG(dbgs() << "Current Cost: " << CurrentGroup.Cost << "\n");
+
+  unsigned AverageRegionBenefit = RegionBenefit / CurrentGroup.Regions.size();
+  unsigned OverallArgumentNum = CurrentGroup.ArgumentTypes.size();
+  unsigned NumRegions = CurrentGroup.Regions.size();
+  TargetTransformInfo &TTI =
+      getTTI(*CurrentGroup.Regions[0]->Candidate->getFunction());
+
+  // We add one region to the cost once, to account for the instructions added
+  // inside of the newly created function.
+  LLVM_DEBUG(dbgs() << "Adding: " << AverageRegionBenefit
+                    << " instructions to cost for body of new function.\n");
+  CurrentGroup.Cost += AverageRegionBenefit;
+  LLVM_DEBUG(dbgs() << "Current Cost: " << CurrentGroup.Cost << "\n");
+
+  // For each argument, we must add an instruction for loading the argument
+  // out of the register and into a value inside of the newly outlined function.
+  LLVM_DEBUG(dbgs() << "Adding: " << OverallArgumentNum
+                    << " instructions to cost for each argument in the new"
+                    << " function.\n");
+  CurrentGroup.Cost += 2 * OverallArgumentNum * TargetTransformInfo::TCC_Basic;
+  LLVM_DEBUG(dbgs() << "Current Cost: " << CurrentGroup.Cost << "\n");
+
+  // Each argument needs to either be loaded into a register or onto the stack.
+  // Some arguments will only be loaded into the stack once the argument
+  // registers are filled.
+  LLVM_DEBUG(dbgs() << "Adding: " << OverallArgumentNum
+                    << " instructions to cost for each argument in the new"
+                    << " function " << NumRegions << " times for the "
+                    << "needed argument handling at the call site.\n");
+  CurrentGroup.Cost +=
+      2 * OverallArgumentNum * TargetTransformInfo::TCC_Basic * NumRegions;
+  LLVM_DEBUG(dbgs() << "Current Cost: " << CurrentGroup.Cost << "\n");
+
+  CurrentGroup.Cost += findCostForOutputBlocks(M, CurrentGroup, TTI);
+  LLVM_DEBUG(dbgs() << "Current Cost: " << CurrentGroup.Cost << "\n");
+}
+
 void IROutliner::updateOutputMapping(OutlinableRegion &Region,
                                     ArrayRef<Value *> Outputs,
                                     LoadInst *LI) {
@ -1348,6 +1542,19 @@ unsigned IROutliner::doOutline(Module &M) {

    CurrentGroup.collectGVNStoreSets(M);

+    if (CostModel)
+      findCostBenefit(M, CurrentGroup);
+
+    // If we are adhering to the cost model, reattach all the candidates
+    if (CurrentGroup.Cost >= CurrentGroup.Benefit && CostModel) {
+      for (OutlinableRegion *OS : CurrentGroup.Regions)
+        OS->reattachCandidate();
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "Outlining regions with cost " << CurrentGroup.Cost
+                      << " and benefit " << CurrentGroup.Benefit << "\n");
+
    // Create functions out of all the sections, and mark them as outlined.
    OutlinedRegions.clear();
    for (OutlinableRegion *OS : CurrentGroup.Regions) {
@ -1377,7 +1584,11 @@ unsigned IROutliner::doOutline(Module &M) {
  return OutlinedFunctionNum;
 }

-bool IROutliner::run(Module &M) { return doOutline(M) > 0; }
+bool IROutliner::run(Module &M) {
+  CostModel = !NoCostModel;
+
+  return doOutline(M) > 0;
+}

 // Pass Manager Boilerplate
 class IROutlinerLegacyPass : public ModulePass {
--- a/test/Transforms/IROutliner/extraction.ll
+++ b/test/Transforms/IROutliner/extraction.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test makes sure we are extracting the found similarity sections
 ; correctly at the call site.
--- a/test/Transforms/IROutliner/illegal-allocas.ll
+++ b/test/Transforms/IROutliner/illegal-allocas.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner  -ir-outlining-no-cost < %s | FileCheck %s

 ; Show that we do not extract allocas, as outlining allocas may cause
 ; inconsistencies with the CodeExtractor's algorithm.
--- a/test/Transforms/IROutliner/illegal-assumes.ll
+++ b/test/Transforms/IROutliner/illegal-assumes.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test ensures that we do not include llvm.assumes.  There are exceptions
 ; in the CodeExtractor's algorithm for llvm.assumes, so we ignore it for now.
--- a/test/Transforms/IROutliner/illegal-branches.ll
+++ b/test/Transforms/IROutliner/illegal-branches.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; Show that we do not extract sections with branches as it would require extra
 ; label and control flow checking.
--- a/test/Transforms/IROutliner/illegal-callbr.ll
+++ b/test/Transforms/IROutliner/illegal-callbr.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that we do not outline callbr instruction since as we do not
 ; outline any control flow change instructions.
--- a/test/Transforms/IROutliner/illegal-calls.ll
+++ b/test/Transforms/IROutliner/illegal-calls.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that we do not outline calls.  Special calls, such as
 ; indirect or nameless calls require extra handling to ensure that there
--- a/test/Transforms/IROutliner/illegal-catchpad.ll
+++ b/test/Transforms/IROutliner/illegal-catchpad.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that catchpad instructions are not outlined even if they
 ; in a similar section.  Dealing with exception handling inside of an outlined
--- a/test/Transforms/IROutliner/illegal-cleanup.ll
+++ b/test/Transforms/IROutliner/illegal-cleanup.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that cleanuppad instructions are not outlined even if they
 ; in a similar section.  Dealing with exception handling inside of an outlined
--- a/test/Transforms/IROutliner/illegal-frozen.ll
+++ b/test/Transforms/IROutliner/illegal-frozen.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; Show that we do not extract freeze instructions, since extra handling is
 ; required to mark any outputs used with freeze.
--- a/test/Transforms/IROutliner/illegal-gep.ll
+++ b/test/Transforms/IROutliner/illegal-gep.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks to make sure that we do not outline getelementptr
 ; instructions since we must make extra checks on the final operands.
--- a/test/Transforms/IROutliner/illegal-invoke.ll
+++ b/test/Transforms/IROutliner/illegal-invoke.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that invoke instructions are not outlined even if they
 ; in a similar section.  Outlining does not currently handle control flow
--- a/test/Transforms/IROutliner/illegal-landingpad.ll
+++ b/test/Transforms/IROutliner/illegal-landingpad.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that landingpad instructions are not outlined even if they
 ; in a similar section.  Dealing with exception handling inside of an outlined
--- a/test/Transforms/IROutliner/illegal-memcpy.ll
+++ b/test/Transforms/IROutliner/illegal-memcpy.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that we do not outline memcpy intrinsics since it may require
 ; extra address space checks.
--- a/test/Transforms/IROutliner/illegal-memmove.ll
+++ b/test/Transforms/IROutliner/illegal-memmove.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that we do not outline memcpy intrinsics since it may require
 ; extra address space checks.
--- a/test/Transforms/IROutliner/illegal-memset.ll
+++ b/test/Transforms/IROutliner/illegal-memset.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that we do not outline memset intrinsics since it requires
 ; extra address space checks.
--- a/test/Transforms/IROutliner/illegal-phi-nodes.ll
+++ b/test/Transforms/IROutliner/illegal-phi-nodes.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; Show that we do not extract phi nodes as it would require extra label and
 ; control flow checking.
--- a/test/Transforms/IROutliner/illegal-vaarg.ll
+++ b/test/Transforms/IROutliner/illegal-vaarg.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test ensures that we do not outline vararg instructions or intrinsics, as
 ; they may cause inconsistencies when outlining.
--- a/test/Transforms/IROutliner/legal-debug.ll
+++ b/test/Transforms/IROutliner/legal-debug.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test looks ahecks that debug info is extracted along with the other
 ; instructions.
--- a/test/Transforms/IROutliner/outlining-address-taken.ll
+++ b/test/Transforms/IROutliner/outlining-address-taken.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test shows that we do not outline from basic blocks with their address
 ; taken.
--- a/test/Transforms/IROutliner/outlining-commutative-fp.ll
+++ b/test/Transforms/IROutliner/outlining-commutative-fp.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that floating point commutative instructions are not treated
 ; as commutative.  Even though an ffadd is technically commutative, the order
--- a/test/Transforms/IROutliner/outlining-commutative.ll
+++ b/test/Transforms/IROutliner/outlining-commutative.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test checks that commutative instructions where the operands are
 ; swapped are outlined as the same function.
--- a/test/Transforms/IROutliner/outlining-constants-vs-registers.ll
+++ b/test/Transforms/IROutliner/outlining-constants-vs-registers.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test looks at instances of constants in the different regions. If there
 ; is a register in the same place as a constant in a similar region of code, we
--- a/test/Transforms/IROutliner/outlining-cost-model.ll
+++ b/test/Transforms/IROutliner/outlining-cost-model.ll
@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s -check-prefix=NOCOST
+
+; This test checks that we have different results from when the cost model
+; is on versus when it is off.  That is, if the number of instructions needed to
+; handle the arguments is greater than the number of instructions being added,
+; we do not outline.
+
+define void @function1() #0 {
+; CHECK-LABEL: @function1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]])
+; CHECK-NEXT:    ret void
+;
+; NOCOST-LABEL: @function1(
+; NOCOST-NEXT:  entry:
+; NOCOST-NEXT:    [[A:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[B:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]])
+; NOCOST-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  %mul = mul i32 %0, %1
+  %sub = sub i32 %0, %1
+  %div = sdiv i32 %0, %1
+  %add2 = add i32 %0, %1
+  %mul2 = mul i32 %0, %1
+  %sub2 = sub i32 %0, %1
+  %div2 = sdiv i32 %0, %1
+  ret void
+}
+
+define void @function2() #0 {
+; CHECK-LABEL: @function2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]])
+; CHECK-NEXT:    ret void
+;
+; NOCOST-LABEL: @function2(
+; NOCOST-NEXT:  entry:
+; NOCOST-NEXT:    [[A:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[B:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]])
+; NOCOST-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  %mul = mul i32 %0, %1
+  %sub = sub i32 %0, %1
+  %div = sdiv i32 %0, %1
+  %add2 = add i32 %0, %1
+  %mul2 = mul i32 %0, %1
+  %sub2 = sub i32 %0, %1
+  %div2 = sdiv i32 %0, %1
+  ret void
+}
+
+define void @function3() #0 {
+; CHECK-LABEL: @function3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
+; CHECK-NEXT:    store i32 3, i32* [[B]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[RESULT]], align 4
+; CHECK-NEXT:    ret void
+;
+; NOCOST-LABEL: @function3(
+; NOCOST-NEXT:  entry:
+; NOCOST-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[A:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[B:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; NOCOST-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; NOCOST-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; NOCOST-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; NOCOST-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]])
+; NOCOST-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; NOCOST-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; NOCOST-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; NOCOST-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; NOCOST-NEXT:    [[TMP0:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; NOCOST-NEXT:    call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; NOCOST-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %3 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
+
+define void @function4() #0 {
+; CHECK-LABEL: @function4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store i32 2, i32* [[A]], align 4
+; CHECK-NEXT:    store i32 3, i32* [[B]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[B]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[OUTPUT]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = mul i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT:    store i32 [[MUL]], i32* [[RESULT]], align 4
+; CHECK-NEXT:    ret void
+;
+; NOCOST-LABEL: @function4(
+; NOCOST-NEXT:  entry:
+; NOCOST-NEXT:    [[DOTLOC:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[ADD_LOC:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[A:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[B:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[OUTPUT:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[RESULT:%.*]] = alloca i32, align 4
+; NOCOST-NEXT:    [[LT_CAST:%.*]] = bitcast i32* [[ADD_LOC]] to i8*
+; NOCOST-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST]])
+; NOCOST-NEXT:    [[LT_CAST1:%.*]] = bitcast i32* [[DOTLOC]] to i8*
+; NOCOST-NEXT:    call void @llvm.lifetime.start.p0i8(i64 -1, i8* [[LT_CAST1]])
+; NOCOST-NEXT:    call void @outlined_ir_func_1(i32* [[A]], i32* [[B]], i32* [[OUTPUT]], i32* [[ADD_LOC]], i32* [[DOTLOC]])
+; NOCOST-NEXT:    [[ADD_RELOAD:%.*]] = load i32, i32* [[ADD_LOC]], align 4
+; NOCOST-NEXT:    [[DOTRELOAD:%.*]] = load i32, i32* [[DOTLOC]], align 4
+; NOCOST-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST]])
+; NOCOST-NEXT:    call void @llvm.lifetime.end.p0i8(i64 -1, i8* [[LT_CAST1]])
+; NOCOST-NEXT:    call void @outlined_ir_func_2(i32 [[DOTRELOAD]], i32 [[ADD_RELOAD]], i32* [[RESULT]])
+; NOCOST-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %output = alloca i32, align 4
+  %result = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  %0 = load i32, i32* %a, align 4
+  %1 = load i32, i32* %b, align 4
+  %add = add i32 %0, %1
+  store i32 %add, i32* %output, align 4
+  %2 = load i32, i32* %output, align 4
+  %mul = mul i32 %2, %add
+  store i32 %mul, i32* %result, align 4
+  ret void
+}
--- a/test/Transforms/IROutliner/outlining-debug-statements.ll
+++ b/test/Transforms/IROutliner/outlining-debug-statements.ll
@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+
+; This test makes sure that we do not include debug statements in outlined
+; functions.
+
+define void @outline_dbg1() {
+; CHECK-LABEL: @outline_dbg1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  call void @llvm.dbg.value(metadata i64 0, metadata !14, metadata !DIExpression()), !dbg !14
+  store i32 4, i32* %c, align 4
+  %al = load i32, i32* %a
+  %bl = load i32, i32* %b
+  %cl = load i32, i32* %c
+  ret void
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+define void @outline_dbg2() {
+; CHECK-LABEL: @outline_dbg2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[C:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    call void @outlined_ir_func_0(i32* [[A]], i32* [[B]], i32* [[C]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca i32, align 4
+  %b = alloca i32, align 4
+  %c = alloca i32, align 4
+  store i32 2, i32* %a, align 4
+  store i32 3, i32* %b, align 4
+  store i32 4, i32* %c, align 4
+  %al = load i32, i32* %a
+  %bl = load i32, i32* %b
+  %cl = load i32, i32* %c
+  ret void
+}
+
+; CHECK: define internal void @outlined_ir_func_0(i32* [[ARG0:%.*]], i32* [[ARG1:%.*]], i32* [[ARG2:%.*]]) #1 {
+; CHECK: entry_to_outline:
+; CHECK-NEXT:    store i32 2, i32* [[ARG0]], align 4
+; CHECK-NEXT:    store i32 3, i32* [[ARG1]], align 4
+; CHECK-NEXT:    store i32 4, i32* [[ARG2]], align 4
+; CHECK-NEXT:    [[AL:%.*]] = load i32, i32* [[ARG0]], align 4
+; CHECK-NEXT:    [[BL:%.*]] = load i32, i32* [[ARG1]], align 4
+; CHECK-NEXT:    [[CL:%.*]] = load i32, i32* [[ARG2]], align 4
+
+!0 = !DIFile(filename: "foo.c", directory: "/tmp")
+!1 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!5 = distinct !DICompileUnit(language: DW_LANG_C, file: !0, producer: "My Compiler", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !6, retainedTypes: !6, globals: !7)
+!6 = !{}
+!7 = !{}
+!11 = distinct !DISubprogram(name: "func_5", scope: !0, file: !0, line: 117, type: !12, isLocal: true, isDefinition: true, scopeLine: 118, isOptimized: false, unit: !5, retainedNodes: !6)
+!12 = !DISubroutineType(types: !13)
+!13 = !{}
+!14 = !DILocalVariable(name: "p_6", arg: 1, scope: !11, line: 117, type: !1)
--- a/test/Transforms/IROutliner/outlining-different-constants.ll
+++ b/test/Transforms/IROutliner/outlining-different-constants.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost< %s | FileCheck %s

 ; This test looks at the constants in the regions, and if it they are the
 ; differents it elevates the constants to arguments.
--- a/test/Transforms/IROutliner/outlining-different-globals.ll
+++ b/test/Transforms/IROutliner/outlining-different-globals.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test looks at the globals in the regions, and makes sure they are not
 ; outlined if they are different values.
--- a/test/Transforms/IROutliner/outlining-different-output-blocks.ll
+++ b/test/Transforms/IROutliner/outlining-different-output-blocks.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -iroutliner < %s | FileCheck %s
+; RUN: opt -S -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; These functions are constructed slightly differently so that they require
 ; different output blocks for the values used outside of the region. We are
--- a/test/Transforms/IROutliner/outlining-different-structure.ll
+++ b/test/Transforms/IROutliner/outlining-different-structure.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner --ir-outlining-no-cost < %s | FileCheck %s

 ; This is a negative case to show that when we have the same set of
 ; instructions, but in a different order, they are not outlined in the same way.
--- a/test/Transforms/IROutliner/outlining-remapped-outputs.ll
+++ b/test/Transforms/IROutliner/outlining-remapped-outputs.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -iroutliner < %s | FileCheck %s
+; RUN: opt -S -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test tests that inputs that are replaced with the output of an outlined
 ; function is still recognized as the same value.
--- a/test/Transforms/IROutliner/outlining-same-constants.ll
+++ b/test/Transforms/IROutliner/outlining-same-constants.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; This test looks at the constants in the regions, and if it they are the
 ; same it outlines them as constants rather than elevating them to arguments.
--- a/test/Transforms/IROutliner/outlining-same-globals.ll
+++ b/test/Transforms/IROutliner/outlining-same-globals.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -verify -iroutliner < %s | FileCheck %s
+; RUN: opt -S -verify -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

@global1 = global i32 1, align 4
@global2 = global i32 2, align 4
--- a/test/Transforms/IROutliner/outlining-same-output-blocks.ll
+++ b/test/Transforms/IROutliner/outlining-same-output-blocks.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -iroutliner < %s | FileCheck %s
+; RUN: opt -S -iroutliner -ir-outlining-no-cost < %s | FileCheck %s

 ; These functions are constructed slightly differently so that they require
 ; the same output blocks for the values used outside of the region. We are