[llvm] Make new pass manager's OptimizationLevel a class

Summary: The old pass manager separated speed optimization and size optimization levels into two unsigned values. Coallescing both in an enum in the new pass manager may lead to unintentional casts and comparisons. In particular, taking a look at how the loop unroll passes were constructed previously, the Os/Oz are now (==new pass manager) treated just like O3, likely unintentionally. This change disallows raw comparisons between optimization levels, to avoid such unintended effects. As an effect, the O{s|z} behavior changes for loop unrolling and loop unroll and jam, matching O2 rather than O3. The change also parameterizes the threshold values used for loop unrolling, primarily to aid testing. Reviewers: tejohnson, davidxl Reviewed By: tejohnson Subscribers: zzheng, ychen, mehdi_amini, hiraditya, steven_wu, dexonsmith, dang, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D72547
2025-01-31 12:41:49 +01:00 · 2020-01-16 08:51:50 -08:00 · 2020-01-16 08:51:50 -08:00 · e90406ee2a
commit e90406ee2a
parent bbf89c711e
6 changed files with 228 additions and 74 deletions
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@ -143,11 +143,26 @@ public:
  ///
  /// This enumerates the LLVM-provided high-level optimization levels. Each
  /// level has a specific goal and rationale.
-  enum OptimizationLevel {
+  class OptimizationLevel final {
+    unsigned SpeedLevel = 2;
+    unsigned SizeLevel = 0;
+    OptimizationLevel(unsigned SpeedLevel, unsigned SizeLevel)
+        : SpeedLevel(SpeedLevel), SizeLevel(SizeLevel) {
+      // Check that only valid combinations are passed.
+      assert(SpeedLevel <= 3 &&
+             "Optimization level for speed should be 0, 1, 2, or 3");
+      assert(SizeLevel <= 2 &&
+             "Optimization level for size should be 0, 1, or 2");
+      assert((SizeLevel == 0 || SpeedLevel == 2) &&
+             "Optimize for size should be encoded with speedup level == 2");
+    }
+
+  public:
+    OptimizationLevel() = default;
    /// Disable as many optimizations as possible. This doesn't completely
    /// disable the optimizer in all cases, for example always_inline functions
    /// can be required to be inlined for correctness.
-    O0,
+    static const OptimizationLevel O0;

    /// Optimize quickly without destroying debuggability.
    ///
@ -161,10 +176,9 @@ public:
    ///
    /// As an example, complex loop transformations such as versioning,
    /// vectorization, or fusion don't make sense here due to the degree to
-    /// which the executed code differs from the source code, and the compile time
-    /// cost.
-    O1,
-
+    /// which the executed code differs from the source code, and the compile
+    /// time cost.
+    static const OptimizationLevel O1;
    /// Optimize for fast execution as much as possible without triggering
    /// significant incremental compile time or code size growth.
    ///
@ -181,8 +195,7 @@ public:
    ///
    /// This is expected to be a good default optimization level for the vast
    /// majority of users.
-    O2,
-
+    static const OptimizationLevel O2;
    /// Optimize for fast execution as much as possible.
    ///
    /// This mode is significantly more aggressive in trading off compile time
@ -197,8 +210,7 @@ public:
    /// order to make even significantly slower compile times at least scale
    /// reasonably. This does not preclude very substantial constant factor
    /// costs though.
-    O3,
-
+    static const OptimizationLevel O3;
    /// Similar to \c O2 but tries to optimize for small code size instead of
    /// fast execution without triggering significant incremental execution
    /// time slowdowns.
@ -209,8 +221,7 @@ public:
    /// A consequence of the different core goal is that this should in general
    /// produce substantially smaller executables that still run in
    /// a reasonable amount of time.
-    Os,
-
+    static const OptimizationLevel Os;
    /// A very specialized mode that will optimize for code size at any and all
    /// costs.
    ///
@ -218,7 +229,24 @@ public:
    /// any effort taken to reduce the size is worth it regardless of the
    /// execution time impact. You should expect this level to produce rather
    /// slow, but very small, code.
-    Oz
+    static const OptimizationLevel Oz;
+
+    bool isOptimizingForSpeed() const {
+      return SizeLevel == 0 && SpeedLevel > 0;
+    }
+
+    bool isOptimizingForSize() const { return SizeLevel > 0; }
+
+    bool operator==(const OptimizationLevel &Other) const {
+      return SizeLevel == Other.SizeLevel && SpeedLevel == Other.SpeedLevel;
+    }
+    bool operator!=(const OptimizationLevel &Other) const {
+      return SizeLevel != Other.SizeLevel || SpeedLevel != Other.SpeedLevel;
+    }
+
+    unsigned getSpeedupLevel() const { return SpeedLevel; }
+
+    unsigned getSizeLevel() const { return SizeLevel; }
  };

  explicit PassBuilder(TargetMachine *TM = nullptr,
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@ -203,16 +203,16 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
  default:
    llvm_unreachable("Invalid optimization level");
  case 0:
-    OL = PassBuilder::O0;
+    OL = PassBuilder::OptimizationLevel::O0;
    break;
  case 1:
-    OL = PassBuilder::O1;
+    OL = PassBuilder::OptimizationLevel::O1;
    break;
  case 2:
-    OL = PassBuilder::O2;
+    OL = PassBuilder::OptimizationLevel::O2;
    break;
  case 3:
-    OL = PassBuilder::O3;
+    OL = PassBuilder::OptimizationLevel::O3;
    break;
  }

--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@ -244,20 +244,24 @@ extern cl::opt<bool> EnableOrderFileInstrumentation;

 extern cl::opt<bool> FlattenedProfileUsed;

-static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) {
-  switch (Level) {
-  case PassBuilder::O0:
-  case PassBuilder::O1:
-  case PassBuilder::O2:
-  case PassBuilder::O3:
-    return false;
-
-  case PassBuilder::Os:
-  case PassBuilder::Oz:
-    return true;
-  }
-  llvm_unreachable("Invalid optimization level!");
-}
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O0 = {
+    /*SpeedLevel*/ 0,
+    /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O1 = {
+    /*SpeedLevel*/ 1,
+    /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O2 = {
+    /*SpeedLevel*/ 2,
+    /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O3 = {
+    /*SpeedLevel*/ 3,
+    /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Os = {
+    /*SpeedLevel*/ 2,
+    /*SizeLevel*/ 1};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Oz = {
+    /*SpeedLevel*/ 2,
+    /*SizeLevel*/ 2};

 namespace {

@ -396,7 +400,7 @@ FunctionPassManager
 PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
                                                 ThinLTOPhase Phase,
                                                 bool DebugLogging) {
-  assert(Level != O0 && "Must request optimizations!");
+  assert(Level != OptimizationLevel::O0 && "Must request optimizations!");
  FunctionPassManager FPM(DebugLogging);

  // Form SSA out of local memory accesses after breaking apart aggregates into
@ -407,7 +411,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));

  // Hoisting of scalars and load expressions.
-  if (Level > O1) {
+  if (Level.getSpeedupLevel() > 1) {
    if (EnableGVNHoist)
      FPM.addPass(GVNHoistPass());

@ -419,7 +423,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  }

  // Speculative execution if the target has divergent branches; otherwise nop.
-  if (Level > O1) {
+  if (Level.getSpeedupLevel() > 1) {
    FPM.addPass(SpeculativeExecutionPass());

    // Optimize based on known information about branches, and cleanup afterward.
@ -427,11 +431,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
    FPM.addPass(CorrelatedValuePropagationPass());
  }
  FPM.addPass(SimplifyCFGPass());
-  if (Level == O3)
+  if (Level == OptimizationLevel::O3)
    FPM.addPass(AggressiveInstCombinePass());
  FPM.addPass(InstCombinePass());

-  if (!isOptimizingForSize(Level))
+  if (!Level.isOptimizingForSize())
    FPM.addPass(LibCallsShrinkWrapPass());

  invokePeepholeEPCallbacks(FPM, Level);
@ -439,11 +443,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  // For PGO use pipeline, try to optimize memory intrinsics such as memcpy
  // using the size value profile. Don't perform this when optimizing for size.
  if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
-      !isOptimizingForSize(Level) && Level > O1)
+      (Level.getSpeedupLevel() > 1 && !Level.isOptimizingForSize()))
    FPM.addPass(PGOMemOPSizeOpt());

  // TODO: Investigate the cost/benefit of tail call elimination on debugging.
-  if (Level > O1)
+  if (Level.getSpeedupLevel() > 1)
    FPM.addPass(TailCallElimPass());
  FPM.addPass(SimplifyCFGPass());

@ -470,7 +474,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  LPM1.addPass(LoopSimplifyCFGPass());

  // Rotate Loop - disable header duplication at -Oz
-  LPM1.addPass(LoopRotatePass(Level != Oz));
+  LPM1.addPass(LoopRotatePass(Level != OptimizationLevel::Oz));
  // TODO: Investigate promotion cap for O1.
  LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
  LPM1.addPass(SimpleLoopUnswitchPass());
@ -487,7 +491,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  if ((Phase != ThinLTOPhase::PreLink || !PGOOpt ||
       PGOOpt->Action != PGOOptions::SampleUse) &&
      PTO.LoopUnrolling)
-    LPM2.addPass(LoopFullUnrollPass(Level, /*OnlyWhenForced=*/false,
+    LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
+                                    /*OnlyWhenForced=*/false,
                                    PTO.ForgetAllSCEVInLoopUnroll));

  for (auto &C : LoopOptimizerEndEPCallbacks)
@ -510,7 +515,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  FPM.addPass(SROA());

  // Eliminate redundancies.
-  if (Level != O1) {
+  if (Level != OptimizationLevel::O1) {
    // These passes add substantial compile time so skip them at O1.
    FPM.addPass(MergedLoadStoreMotionPass());
    if (RunNewGVN)
@ -539,7 +544,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,

  // Re-consider control flow based optimizations after redundancy elimination,
  // redo DCE, etc.
-  if (Level > O1) {
+  if (Level.getSpeedupLevel() > 1) {
    FPM.addPass(JumpThreadingPass());
    FPM.addPass(CorrelatedValuePropagationPass());
    FPM.addPass(DSEPass());
@ -559,7 +564,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
  FPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(FPM, Level);

-  if (EnableCHR && Level == O3 && PGOOpt &&
+  if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
      (PGOOpt->Action == PGOOptions::IRUse ||
       PGOOpt->Action == PGOOptions::SampleUse))
    FPM.addPass(ControlHeightReductionPass());
@ -572,13 +577,13 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
                                    bool RunProfileGen, bool IsCS,
                                    std::string ProfileFile,
                                    std::string ProfileRemappingFile) {
-  assert(Level != O0 && "Not expecting O0 here!");
+  assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
  // Generally running simplification passes and the inliner with an high
  // threshold results in smaller executables, but there may be cases where
  // the size grows, so let's be conservative here and skip this simplification
  // at -Os/Oz. We will not do this  inline for context sensistive PGO (when
  // IsCS is true).
-  if (!isOptimizingForSize(Level) && !IsCS) {
+  if (!Level.isOptimizingForSize() && !IsCS) {
    InlineParams IP;

    IP.DefaultThreshold = PreInlineThreshold;
@ -663,10 +668,7 @@ void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM,

 static InlineParams
 getInlineParamsFromOptLevel(PassBuilder::OptimizationLevel Level) {
-  auto O3 = PassBuilder::O3;
-  unsigned OptLevel = Level > O3 ? 2 : Level;
-  unsigned SizeLevel = Level > O3 ? Level - O3 : 0;
-  return getInlineParams(OptLevel, SizeLevel);
+  return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel());
 }

 ModulePassManager
@ -712,7 +714,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
  EarlyFPM.addPass(SROA());
  EarlyFPM.addPass(EarlyCSEPass());
  EarlyFPM.addPass(LowerExpectIntrinsicPass());
-  if (Level == O3)
+  if (Level == OptimizationLevel::O3)
    EarlyFPM.addPass(CallSiteSplittingPass());

  // In SamplePGO ThinLTO backend, we need instcombine before profile annotation
@ -831,7 +833,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,

  // When at O3 add argument promotion to the pass pipeline.
  // FIXME: It isn't at all clear why this should be limited to O3.
-  if (Level == O3)
+  if (Level == OptimizationLevel::O3)
    MainCGPipeline.addPass(ArgumentPromotionPass());

  // Lastly, add the core function simplification pipeline nested inside the
@ -975,11 +977,11 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
  // across the loop nests.
  // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
-    OptimizePM.addPass(LoopUnrollAndJamPass(Level));
+    OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
  }
-  OptimizePM.addPass(LoopUnrollPass(
-      LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling,
-                        PTO.ForgetAllSCEVInLoopUnroll)));
+  OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(
+      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+      PTO.ForgetAllSCEVInLoopUnroll)));
  OptimizePM.addPass(WarnMissedTransformationsPass());
  OptimizePM.addPass(InstCombinePass());
  OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
@ -1041,7 +1043,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
 ModulePassManager
 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
                                           bool DebugLogging, bool LTOPreLink) {
-  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+  assert(Level != OptimizationLevel::O0 &&
+         "Must request optimizations for the default pipeline!");

  ModulePassManager MPM(DebugLogging);

@ -1068,7 +1071,8 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
 ModulePassManager
 PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
                                                bool DebugLogging) {
-  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+  assert(Level != OptimizationLevel::O0 &&
+         "Must request optimizations for the default pipeline!");

  ModulePassManager MPM(DebugLogging);

@ -1129,7 +1133,7 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
    MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
  }

-  if (Level == O0)
+  if (Level == OptimizationLevel::O0)
    return MPM;

  // Force any function attributes we want the rest of the pipeline to observe.
@ -1148,10 +1152,11 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
 ModulePassManager
 PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level,
                                            bool DebugLogging) {
-  assert(Level != O0 && "Must request optimizations for the default pipeline!");
+  assert(Level != OptimizationLevel::O0 &&
+         "Must request optimizations for the default pipeline!");
  // FIXME: We should use a customized pre-link pipeline!
  return buildPerModuleDefaultPipeline(Level, DebugLogging,
-                                       /* LTOPreLink */true);
+                                       /* LTOPreLink */ true);
 }

 ModulePassManager
@ -1159,7 +1164,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
                                     ModuleSummaryIndex *ExportSummary) {
  ModulePassManager MPM(DebugLogging);

-  if (Level == O0) {
+  if (Level == OptimizationLevel::O0) {
    // The WPD and LowerTypeTest passes need to run at -O0 to lower type
    // metadata and intrinsics.
    MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
@ -1188,7 +1193,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
  // libraries and other oracles.
  MPM.addPass(InferFunctionAttrsPass());

-  if (Level > 1) {
+  if (Level.getSpeedupLevel() > 1) {
    FunctionPassManager EarlyFPM(DebugLogging);
    EarlyFPM.addPass(CallSiteSplittingPass());
    MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
@ -1226,7 +1231,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
  MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));

  // Stop here at -O1.
-  if (Level == 1) {
+  if (Level == OptimizationLevel::O1) {
    // The LowerTypeTestsPass needs to run to lower type metadata and the
    // type.test intrinsics. The pass does nothing if CFI is disabled.
    MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
@ -1251,7 +1256,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
  // function pointers.  When this happens, we often have to resolve varargs
  // calls, etc, so let instcombine do this.
  FunctionPassManager PeepholeFPM(DebugLogging);
-  if (Level == O3)
+  if (Level == OptimizationLevel::O3)
    PeepholeFPM.addPass(AggressiveInstCombinePass());
  PeepholeFPM.addPass(InstCombinePass());
  invokePeepholeEPCallbacks(PeepholeFPM, Level);
@ -1887,13 +1892,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
    assert(Matches.size() == 3 && "Must capture two matched strings!");

    OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2])
-                              .Case("O0", O0)
-                              .Case("O1", O1)
-                              .Case("O2", O2)
-                              .Case("O3", O3)
-                              .Case("Os", Os)
-                              .Case("Oz", Oz);
-    if (L == O0) {
+                              .Case("O0", OptimizationLevel::O0)
+                              .Case("O1", OptimizationLevel::O1)
+                              .Case("O2", OptimizationLevel::O2)
+                              .Case("O3", OptimizationLevel::O3)
+                              .Case("Os", OptimizationLevel::Os)
+                              .Case("Oz", OptimizationLevel::Oz);
+    if (L == OptimizationLevel::O0) {
      // Add instrumentation PGO passes -- at O0 we can still do PGO.
      if (PGOOpt && Matches[1] != "thinlto" &&
          (PGOOpt->Action == PGOOptions::IRInstr ||
@ -1910,8 +1915,10 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
    // This is consistent with old pass manager invoked via opt, but
    // inconsistent with clang. Clang doesn't enable loop vectorization
    // but does enable slp vectorization at Oz.
-    PTO.LoopVectorization = L > O1 && L < Oz;
-    PTO.SLPVectorization = L > O1 && L < Oz;
+    PTO.LoopVectorization =
+        L.getSpeedupLevel() > 1 && L != OptimizationLevel::Oz;
+    PTO.SLPVectorization =
+        L.getSpeedupLevel() > 1 && L != OptimizationLevel::Oz;

    if (Matches[1] == "default") {
      MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@ -171,6 +171,16 @@ static cl::opt<bool> UnrollRevisitChildLoops(
             "This shouldn't typically be needed as child loops (or their "
             "clones) were already visited."));

+static cl::opt<unsigned> UnrollThresholdAggressive(
+    "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
+    cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
+             "optimizations"));
+static cl::opt<unsigned>
+    UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
+                           cl::Hidden,
+                           cl::desc("Default threshold (max size of unrolled "
+                                    "loop), used in all but O3 optimizations"));
+
 /// A magic value for use with the Threshold parameter to indicate
 /// that the loop unroll should be performed regardless of how much
 /// code expansion would result.
@ -189,7 +199,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
  TargetTransformInfo::UnrollingPreferences UP;

  // Set up the defaults
-  UP.Threshold = OptLevel > 2 ? 300 : 150;
+  UP.Threshold =
+      OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
  UP.MaxPercentThresholdBoost = 400;
  UP.OptSizeThreshold = 0;
  UP.PartialThreshold = 150;
--- a/test/Transforms/LoopUnroll/opt-levels.ll
+++ b/test/Transforms/LoopUnroll/opt-levels.ll
@ -0,0 +1,47 @@
+; RUN: opt < %s -S -passes="default<O2>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O2
+; RUN: opt < %s -S -passes="default<O3>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O3
+; RUN: opt < %s -S -passes="default<Os>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Os
+; RUN: opt < %s -S -passes="default<Oz>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Oz
+
+; Check that Os and Oz are optimized like O2, not like O3. To easily highlight
+; the behavior, we artificially disable unrolling for anything but O3 by setting
+; the default threshold to 0.
+
+; O3:     loop2.preheader
+; O2-NOT: loop2.preheader
+; Os-NOT: loop2.preheader
+; Oz-NOT: loop2.preheader
+
+define void @unroll(i32 %iter, i32* %addr1, i32* %addr2) nounwind {
+entry:
+  br label %loop1
+
+loop1:
+  %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]
+  %offset1 = getelementptr i32, i32* %addr1, i32 %iv1
+  store i32 %iv1, i32* %offset1, align 4
+  br label %loop2.header
+
+loop2.header:
+  %e = icmp uge i32 %iter, 1
+  br i1 %e, label %loop2, label %exit2
+
+loop2:
+  %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]
+  %offset2 = getelementptr i32, i32* %addr2, i32 %iv2
+  store i32 %iv2, i32* %offset2, align 4
+  %inc2 = add i32 %iv2, 1
+  %exitcnd2 = icmp uge i32 %inc2, %iter
+  br i1 %exitcnd2, label %exit2, label %loop2
+
+exit2:
+  br label %loop1.latch
+
+loop1.latch:
+  %inc1 = add i32 %iv1, 1
+  %exitcnd1 = icmp uge i32 %inc1, 1024
+  br i1 %exitcnd1, label %exit, label %loop1
+
+exit:
+  ret void
+}
--- a/test/Transforms/LoopUnrollAndJam/opt-levels.ll
+++ b/test/Transforms/LoopUnrollAndJam/opt-levels.ll
@ -0,0 +1,61 @@
+; RUN: opt < %s -S -passes="default<O2>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O2
+; RUN: opt < %s -S -passes="default<O3>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O3
+; RUN: opt < %s -S -passes="default<Os>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Os
+; RUN: opt < %s -S -passes="default<Oz>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Oz
+
+; Check that Os and Oz are optimized like O2, not like O3. To easily highlight
+; the behavior, we artificially disable unrolling for anything but O3 by setting
+; the default threshold to 0.
+
+; O3:     for.inner.1
+; O2-NOT: for.inner.1
+; Os-NOT: for.inner.1
+; Oz-NOT: for.inner.1
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
+entry:
+  %cmp = icmp ne i32 %J, 0
+  %cmpJ = icmp ne i32 %I, 0
+  %or.cond = and i1 %cmp, %cmpJ
+  br i1 %or.cond, label %for.outer.preheader, label %for.end
+
+for.outer.preheader:
+  br label %for.outer
+
+for.outer:
+  %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
+  br label %for.inner
+
+for.inner:
+  %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
+  %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
+  %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
+  %add = add i32 %0, %sum
+  %inc = add nuw i32 %j, 1
+  %exitcond = icmp eq i32 %inc, %J
+  br i1 %exitcond, label %for.latch, label %for.inner
+
+for.latch:
+  %add.lcssa = phi i32 [ %add, %for.inner ]
+  %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
+  store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5
+  %add8 = add nuw i32 %i, 1
+  %exitcond25 = icmp eq i32 %add8, %I
+  br i1 %exitcond25, label %for.end.loopexit, label %for.outer
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+
+
+!5 = !{!6, !6, i64 0}
+!6 = !{!"int", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}