diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h index e7db8fd421f..828ac004117 100644 --- a/include/llvm/Passes/PassBuilder.h +++ b/include/llvm/Passes/PassBuilder.h @@ -143,11 +143,26 @@ public: /// /// This enumerates the LLVM-provided high-level optimization levels. Each /// level has a specific goal and rationale. - enum OptimizationLevel { + class OptimizationLevel final { + unsigned SpeedLevel = 2; + unsigned SizeLevel = 0; + OptimizationLevel(unsigned SpeedLevel, unsigned SizeLevel) + : SpeedLevel(SpeedLevel), SizeLevel(SizeLevel) { + // Check that only valid combinations are passed. + assert(SpeedLevel <= 3 && + "Optimization level for speed should be 0, 1, 2, or 3"); + assert(SizeLevel <= 2 && + "Optimization level for size should be 0, 1, or 2"); + assert((SizeLevel == 0 || SpeedLevel == 2) && + "Optimize for size should be encoded with speedup level == 2"); + } + + public: + OptimizationLevel() = default; /// Disable as many optimizations as possible. This doesn't completely /// disable the optimizer in all cases, for example always_inline functions /// can be required to be inlined for correctness. - O0, + static const OptimizationLevel O0; /// Optimize quickly without destroying debuggability. /// @@ -161,10 +176,9 @@ public: /// /// As an example, complex loop transformations such as versioning, /// vectorization, or fusion don't make sense here due to the degree to - /// which the executed code differs from the source code, and the compile time - /// cost. - O1, - + /// which the executed code differs from the source code, and the compile + /// time cost. + static const OptimizationLevel O1; /// Optimize for fast execution as much as possible without triggering /// significant incremental compile time or code size growth. /// @@ -181,8 +195,7 @@ public: /// /// This is expected to be a good default optimization level for the vast /// majority of users. 
- O2, - + static const OptimizationLevel O2; /// Optimize for fast execution as much as possible. /// /// This mode is significantly more aggressive in trading off compile time @@ -197,8 +210,7 @@ public: /// order to make even significantly slower compile times at least scale /// reasonably. This does not preclude very substantial constant factor /// costs though. - O3, - + static const OptimizationLevel O3; /// Similar to \c O2 but tries to optimize for small code size instead of /// fast execution without triggering significant incremental execution /// time slowdowns. @@ -209,8 +221,7 @@ public: /// A consequence of the different core goal is that this should in general /// produce substantially smaller executables that still run in /// a reasonable amount of time. - Os, - + static const OptimizationLevel Os; /// A very specialized mode that will optimize for code size at any and all /// costs. /// @@ -218,7 +229,24 @@ public: /// any effort taken to reduce the size is worth it regardless of the /// execution time impact. You should expect this level to produce rather /// slow, but very small, code. 
- Oz + static const OptimizationLevel Oz; + + bool isOptimizingForSpeed() const { + return SizeLevel == 0 && SpeedLevel > 0; + } + + bool isOptimizingForSize() const { return SizeLevel > 0; } + + bool operator==(const OptimizationLevel &Other) const { + return SizeLevel == Other.SizeLevel && SpeedLevel == Other.SpeedLevel; + } + bool operator!=(const OptimizationLevel &Other) const { + return SizeLevel != Other.SizeLevel || SpeedLevel != Other.SpeedLevel; + } + + unsigned getSpeedupLevel() const { return SpeedLevel; } + + unsigned getSizeLevel() const { return SizeLevel; } }; explicit PassBuilder(TargetMachine *TM = nullptr, diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index dcde7277b82..3cd84ad1dac 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -203,16 +203,16 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM, default: llvm_unreachable("Invalid optimization level"); case 0: - OL = PassBuilder::O0; + OL = PassBuilder::OptimizationLevel::O0; break; case 1: - OL = PassBuilder::O1; + OL = PassBuilder::OptimizationLevel::O1; break; case 2: - OL = PassBuilder::O2; + OL = PassBuilder::OptimizationLevel::O2; break; case 3: - OL = PassBuilder::O3; + OL = PassBuilder::OptimizationLevel::O3; break; } diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 53b7db8689c..a65ea0d9e37 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -244,20 +244,24 @@ extern cl::opt<bool> EnableOrderFileInstrumentation; extern cl::opt<bool> FlattenedProfileUsed; -static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) { - switch (Level) { - case PassBuilder::O0: - case PassBuilder::O1: - case PassBuilder::O2: - case PassBuilder::O3: - return false; - - case PassBuilder::Os: - case PassBuilder::Oz: - return true; - } - llvm_unreachable("Invalid optimization level!"); -} +const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O0 = { + /*SpeedLevel*/ 0, + /*SizeLevel*/ 
0}; +const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O1 = { + /*SpeedLevel*/ 1, + /*SizeLevel*/ 0}; +const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O2 = { + /*SpeedLevel*/ 2, + /*SizeLevel*/ 0}; +const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O3 = { + /*SpeedLevel*/ 3, + /*SizeLevel*/ 0}; +const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Os = { + /*SpeedLevel*/ 2, + /*SizeLevel*/ 1}; +const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Oz = { + /*SpeedLevel*/ 2, + /*SizeLevel*/ 2}; namespace { @@ -396,7 +400,7 @@ FunctionPassManager PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, ThinLTOPhase Phase, bool DebugLogging) { - assert(Level != O0 && "Must request optimizations!"); + assert(Level != OptimizationLevel::O0 && "Must request optimizations!"); FunctionPassManager FPM(DebugLogging); // Form SSA out of local memory accesses after breaking apart aggregates into @@ -407,7 +411,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */)); // Hoisting of scalars and load expressions. - if (Level > O1) { + if (Level.getSpeedupLevel() > 1) { if (EnableGVNHoist) FPM.addPass(GVNHoistPass()); @@ -419,7 +423,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, } // Speculative execution if the target has divergent branches; otherwise nop. - if (Level > O1) { + if (Level.getSpeedupLevel() > 1) { FPM.addPass(SpeculativeExecutionPass()); // Optimize based on known information about branches, and cleanup afterward. 
@@ -427,11 +431,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(CorrelatedValuePropagationPass()); } FPM.addPass(SimplifyCFGPass()); - if (Level == O3) + if (Level == OptimizationLevel::O3) FPM.addPass(AggressiveInstCombinePass()); FPM.addPass(InstCombinePass()); - if (!isOptimizingForSize(Level)) + if (!Level.isOptimizingForSize()) FPM.addPass(LibCallsShrinkWrapPass()); invokePeepholeEPCallbacks(FPM, Level); @@ -439,11 +443,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // For PGO use pipeline, try to optimize memory intrinsics such as memcpy // using the size value profile. Don't perform this when optimizing for size. if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse && - !isOptimizingForSize(Level) && Level > O1) + (Level.getSpeedupLevel() > 1 && !Level.isOptimizingForSize())) FPM.addPass(PGOMemOPSizeOpt()); // TODO: Investigate the cost/benefit of tail call elimination on debugging. - if (Level > O1) + if (Level.getSpeedupLevel() > 1) FPM.addPass(TailCallElimPass()); FPM.addPass(SimplifyCFGPass()); @@ -470,7 +474,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, LPM1.addPass(LoopSimplifyCFGPass()); // Rotate Loop - disable header duplication at -Oz - LPM1.addPass(LoopRotatePass(Level != Oz)); + LPM1.addPass(LoopRotatePass(Level != OptimizationLevel::Oz)); // TODO: Investigate promotion cap for O1. 
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); LPM1.addPass(SimpleLoopUnswitchPass()); @@ -487,7 +491,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, if ((Phase != ThinLTOPhase::PreLink || !PGOOpt || PGOOpt->Action != PGOOptions::SampleUse) && PTO.LoopUnrolling) - LPM2.addPass(LoopFullUnrollPass(Level, /*OnlyWhenForced=*/false, + LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(), + /*OnlyWhenForced=*/false, PTO.ForgetAllSCEVInLoopUnroll)); for (auto &C : LoopOptimizerEndEPCallbacks) @@ -510,7 +515,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(SROA()); // Eliminate redundancies. - if (Level != O1) { + if (Level != OptimizationLevel::O1) { // These passes add substantial compile time so skip them at O1. FPM.addPass(MergedLoadStoreMotionPass()); if (RunNewGVN) @@ -539,7 +544,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Re-consider control flow based optimizations after redundancy elimination, // redo DCE, etc. 
- if (Level > O1) { + if (Level.getSpeedupLevel() > 1) { FPM.addPass(JumpThreadingPass()); FPM.addPass(CorrelatedValuePropagationPass()); FPM.addPass(DSEPass()); @@ -559,7 +564,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, FPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(FPM, Level); - if (EnableCHR && Level == O3 && PGOOpt && + if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt && (PGOOpt->Action == PGOOptions::IRUse || PGOOpt->Action == PGOOptions::SampleUse)) FPM.addPass(ControlHeightReductionPass()); @@ -572,13 +577,13 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, bool RunProfileGen, bool IsCS, std::string ProfileFile, std::string ProfileRemappingFile) { - assert(Level != O0 && "Not expecting O0 here!"); + assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!"); // Generally running simplification passes and the inliner with an high // threshold results in smaller executables, but there may be cases where // the size grows, so let's be conservative here and skip this simplification // at -Os/Oz. We will not do this inline for context sensistive PGO (when // IsCS is true). - if (!isOptimizingForSize(Level) && !IsCS) { + if (!Level.isOptimizingForSize() && !IsCS) { InlineParams IP; IP.DefaultThreshold = PreInlineThreshold; @@ -663,10 +668,7 @@ void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM, static InlineParams getInlineParamsFromOptLevel(PassBuilder::OptimizationLevel Level) { - auto O3 = PassBuilder::O3; - unsigned OptLevel = Level > O3 ? 2 : Level; - unsigned SizeLevel = Level > O3 ? 
Level - O3 : 0; - return getInlineParams(OptLevel, SizeLevel); + return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel()); } ModulePassManager @@ -712,7 +714,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, EarlyFPM.addPass(SROA()); EarlyFPM.addPass(EarlyCSEPass()); EarlyFPM.addPass(LowerExpectIntrinsicPass()); - if (Level == O3) + if (Level == OptimizationLevel::O3) EarlyFPM.addPass(CallSiteSplittingPass()); // In SamplePGO ThinLTO backend, we need instcombine before profile annotation @@ -831,7 +833,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level, // When at O3 add argument promotion to the pass pipeline. // FIXME: It isn't at all clear why this should be limited to O3. - if (Level == O3) + if (Level == OptimizationLevel::O3) MainCGPipeline.addPass(ArgumentPromotionPass()); // Lastly, add the core function simplification pipeline nested inside the @@ -975,11 +977,11 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( // across the loop nests. 
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll if (EnableUnrollAndJam && PTO.LoopUnrolling) { - OptimizePM.addPass(LoopUnrollAndJamPass(Level)); + OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel())); } - OptimizePM.addPass(LoopUnrollPass( - LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling, - PTO.ForgetAllSCEVInLoopUnroll))); + OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions( + Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling, + PTO.ForgetAllSCEVInLoopUnroll))); OptimizePM.addPass(WarnMissedTransformationsPass()); OptimizePM.addPass(InstCombinePass()); OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>()); @@ -1041,7 +1043,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( ModulePassManager PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, bool DebugLogging, bool LTOPreLink) { - assert(Level != O0 && "Must request optimizations for the default pipeline!"); + assert(Level != OptimizationLevel::O0 && + "Must request optimizations for the default pipeline!"); ModulePassManager MPM(DebugLogging); @@ -1068,7 +1071,8 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level, ModulePassManager PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level, bool DebugLogging) { - assert(Level != O0 && "Must request optimizations for the default pipeline!"); + assert(Level != OptimizationLevel::O0 && + "Must request optimizations for the default pipeline!"); ModulePassManager MPM(DebugLogging); @@ -1129,7 +1133,7 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary)); } - if (Level == O0) + if (Level == OptimizationLevel::O0) return MPM; // Force any function attributes we want the rest of the pipeline to observe. 
@@ -1148,10 +1152,11 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline( ModulePassManager PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level, bool DebugLogging) { - assert(Level != O0 && "Must request optimizations for the default pipeline!"); + assert(Level != OptimizationLevel::O0 && + "Must request optimizations for the default pipeline!"); // FIXME: We should use a customized pre-link pipeline! return buildPerModuleDefaultPipeline(Level, DebugLogging, - /* LTOPreLink */true); + /* LTOPreLink */ true); } ModulePassManager @@ -1159,7 +1164,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, ModuleSummaryIndex *ExportSummary) { ModulePassManager MPM(DebugLogging); - if (Level == O0) { + if (Level == OptimizationLevel::O0) { // The WPD and LowerTypeTest passes need to run at -O0 to lower type // metadata and intrinsics. MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); @@ -1188,7 +1193,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, // libraries and other oracles. MPM.addPass(InferFunctionAttrsPass()); - if (Level > 1) { + if (Level.getSpeedupLevel() > 1) { FunctionPassManager EarlyFPM(DebugLogging); EarlyFPM.addPass(CallSiteSplittingPass()); MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM))); @@ -1226,7 +1231,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr)); // Stop here at -O1. - if (Level == 1) { + if (Level == OptimizationLevel::O1) { // The LowerTypeTestsPass needs to run to lower type metadata and the // type.test intrinsics. The pass does nothing if CFI is disabled. MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr)); @@ -1251,7 +1256,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging, // function pointers. 
When this happens, we often have to resolve varargs // calls, etc, so let instcombine do this. FunctionPassManager PeepholeFPM(DebugLogging); - if (Level == O3) + if (Level == OptimizationLevel::O3) PeepholeFPM.addPass(AggressiveInstCombinePass()); PeepholeFPM.addPass(InstCombinePass()); invokePeepholeEPCallbacks(PeepholeFPM, Level); @@ -1887,13 +1892,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, assert(Matches.size() == 3 && "Must capture two matched strings!"); OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2]) - .Case("O0", O0) - .Case("O1", O1) - .Case("O2", O2) - .Case("O3", O3) - .Case("Os", Os) - .Case("Oz", Oz); - if (L == O0) { + .Case("O0", OptimizationLevel::O0) + .Case("O1", OptimizationLevel::O1) + .Case("O2", OptimizationLevel::O2) + .Case("O3", OptimizationLevel::O3) + .Case("Os", OptimizationLevel::Os) + .Case("Oz", OptimizationLevel::Oz); + if (L == OptimizationLevel::O0) { // Add instrumentation PGO passes -- at O0 we can still do PGO. if (PGOOpt && Matches[1] != "thinlto" && (PGOOpt->Action == PGOOptions::IRInstr || @@ -1910,8 +1915,10 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM, // This is consistent with old pass manager invoked via opt, but // inconsistent with clang. Clang doesn't enable loop vectorization // but does enable slp vectorization at Oz. 
- PTO.LoopVectorization = L > O1 && L < Oz; - PTO.SLPVectorization = L > O1 && L < Oz; + PTO.LoopVectorization = + L.getSpeedupLevel() > 1 && L != OptimizationLevel::Oz; + PTO.SLPVectorization = + L.getSpeedupLevel() > 1 && L != OptimizationLevel::Oz; if (Matches[1] == "default") { MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging)); diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index bb0459bfafc..a3994eab38a 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -171,6 +171,16 @@ static cl::opt<bool> UnrollRevisitChildLoops( "This shouldn't typically be needed as child loops (or their " "clones) were already visited.")); +static cl::opt<unsigned> UnrollThresholdAggressive( + "unroll-threshold-aggressive", cl::init(300), cl::Hidden, + cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) " + "optimizations")); +static cl::opt<unsigned> + UnrollThresholdDefault("unroll-threshold-default", cl::init(150), + cl::Hidden, + cl::desc("Default threshold (max size of unrolled " + "loop), used in all but O3 optimizations")); + /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. @@ -189,7 +199,8 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences( TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults - UP.Threshold = OptLevel > 2 ? 300 : 150; + UP.Threshold = + OptLevel > 2 ? 
UnrollThresholdAggressive : UnrollThresholdDefault; UP.MaxPercentThresholdBoost = 400; UP.OptSizeThreshold = 0; UP.PartialThreshold = 150; diff --git a/test/Transforms/LoopUnroll/opt-levels.ll b/test/Transforms/LoopUnroll/opt-levels.ll new file mode 100644 index 00000000000..ed0abc7672e --- /dev/null +++ b/test/Transforms/LoopUnroll/opt-levels.ll @@ -0,0 +1,47 @@ +; RUN: opt < %s -S -passes="default<O2>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O2 +; RUN: opt < %s -S -passes="default<O3>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O3 +; RUN: opt < %s -S -passes="default<Os>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Os +; RUN: opt < %s -S -passes="default<Oz>" -unroll-runtime=true -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Oz + +; Check that Os and Oz are optimized like O2, not like O3. To easily highlight +; the behavior, we artificially disable unrolling for anything but O3 by setting +; the default threshold to 0. 
+ +; O3: loop2.preheader +; O2-NOT: loop2.preheader +; Os-NOT: loop2.preheader +; Oz-NOT: loop2.preheader + +define void @unroll(i32 %iter, i32* %addr1, i32* %addr2) nounwind { +entry: + br label %loop1 + +loop1: + %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ] + %offset1 = getelementptr i32, i32* %addr1, i32 %iv1 + store i32 %iv1, i32* %offset1, align 4 + br label %loop2.header + +loop2.header: + %e = icmp uge i32 %iter, 1 + br i1 %e, label %loop2, label %exit2 + +loop2: + %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ] + %offset2 = getelementptr i32, i32* %addr2, i32 %iv2 + store i32 %iv2, i32* %offset2, align 4 + %inc2 = add i32 %iv2, 1 + %exitcnd2 = icmp uge i32 %inc2, %iter + br i1 %exitcnd2, label %exit2, label %loop2 + +exit2: + br label %loop1.latch + +loop1.latch: + %inc1 = add i32 %iv1, 1 + %exitcnd1 = icmp uge i32 %inc1, 1024 + br i1 %exitcnd1, label %exit, label %loop1 + +exit: + ret void +} diff --git a/test/Transforms/LoopUnrollAndJam/opt-levels.ll b/test/Transforms/LoopUnrollAndJam/opt-levels.ll new file mode 100644 index 00000000000..ef6d2c7502f --- /dev/null +++ b/test/Transforms/LoopUnrollAndJam/opt-levels.ll @@ -0,0 +1,61 @@ +; RUN: opt < %s -S -passes="default<O2>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O2 +; RUN: opt < %s -S -passes="default<O3>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=O3 +; RUN: opt < %s -S -passes="default<Os>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Os +; RUN: opt < %s -S -passes="default<Oz>" -unroll-runtime=true -enable-npm-unroll-and-jam -unroll-threshold-default=0 -unroll-threshold-aggressive=300 | FileCheck %s -check-prefix=Oz + +; Check that Os and Oz are optimized like O2, not like O3. 
To easily highlight +; the behavior, we artificially disable unrolling for anything but O3 by setting +; the default threshold to 0. + +; O3: for.inner.1 +; O2-NOT: for.inner.1 +; Os-NOT: for.inner.1 +; Oz-NOT: for.inner.1 + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 { +entry: + %cmp = icmp ne i32 %J, 0 + %cmpJ = icmp ne i32 %I, 0 + %or.cond = and i1 %cmp, %cmpJ + br i1 %or.cond, label %for.outer.preheader, label %for.end + +for.outer.preheader: + br label %for.outer + +for.outer: + %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ] + br label %for.inner + +for.inner: + %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ] + %sum = phi i32 [ 0, %for.outer ], [ %add, %for.inner ] + %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j + %0 = load i32, i32* %arrayidx, align 4, !tbaa !5 + %add = add i32 %0, %sum + %inc = add nuw i32 %j, 1 + %exitcond = icmp eq i32 %inc, %J + br i1 %exitcond, label %for.latch, label %for.inner + +for.latch: + %add.lcssa = phi i32 [ %add, %for.inner ] + %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i + store i32 %add.lcssa, i32* %arrayidx6, align 4, !tbaa !5 + %add8 = add nuw i32 %i, 1 + %exitcond25 = icmp eq i32 %add8, %I + br i1 %exitcond25, label %for.end.loopexit, label %for.outer + +for.end.loopexit: + br label %for.end + +for.end: + ret void +} + + + +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"}