[PassManager] add helper function to hold set of vector passes (2nd try)
This is closer to no-functional-change-intended than the 1st attempt. As noted in D102002, at least 2 diffs went unchecked in the pass manager regression tests: different pass parameters (SimplifyCFG) and an extension point/callback. Those should now be lifted correctly from the original code blocks.
This commit is contained in:
parent 45aaf991af
commit 971fe30acc
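
To make the shape of the refactor concrete before the diff, here is a minimal sketch (heavily condensed, not the actual helper; the real function is the full pass list added in lib/Passes/PassBuilder.cpp below): the passes shared by the default and LTO pipelines move into one member function, and a single IsLTO flag gates the few pipeline-specific pieces, such as the SimplifyCFG parameters that slipped through in the 1st attempt.

// Condensed sketch of the new helper; the complete pass list is in the diff.
void PassBuilder::addVectorPasses(OptimizationLevel Level,
                                  FunctionPassManager &FPM, bool IsLTO) {
  // Shared by both pipelines: run the loop vectorizer unconditionally.
  FPM.addPass(LoopVectorizePass(
      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
  // Pipeline-specific pieces are gated on the flag. For example, only the
  // full-LTO pipeline uses these SimplifyCFG parameters -- one of the two
  // diffs that went unchecked in the 1st attempt.
  if (IsLTO)
    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));
}

// Both call sites then replace their duplicated inline blocks:
//   addVectorPasses(Level, OptimizePM, /* IsLTO */ false); // default pipeline
//   addVectorPasses(Level, MainFPM, /* IsLTO */ true);     // LTO pipeline

The legacy PassManagerBuilder gets the same treatment with an overload that takes a legacy::PassManagerBase.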
include/llvm/Passes/PassBuilder.h
@@ -709,6 +709,9 @@ private:
   void addRequiredLTOPreLinkPasses(ModulePassManager &MPM);
 
+  void addVectorPasses(OptimizationLevel Level, FunctionPassManager &FPM,
+                       bool IsLTO);
+
   static Optional<std::vector<PipelineElement>>
   parsePipelineText(StringRef Text);
 
include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -218,7 +218,8 @@ private:
   void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM);
   void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS);
   void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM);
+  void addVectorPasses(legacy::PassManagerBase &PM, bool IsLTO);
 
 public:
   /// populateFunctionPassManager - This fills in the function pass manager,
   /// which is expected to be run on each function immediately as it is
lib/Passes/PassBuilder.cpp
@@ -1201,6 +1201,127 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   return MPM;
 }
 
+/// TODO: Should LTO cause any differences to this set of passes?
+void PassBuilder::addVectorPasses(OptimizationLevel Level,
+                                  FunctionPassManager &FPM, bool IsLTO) {
+  FPM.addPass(LoopVectorizePass(
+      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
+
+  if (IsLTO) {
+    // The vectorizer may have significantly shortened a loop body; unroll
+    // again. Unroll small loops to hide loop backedge latency and saturate any
+    // parallel execution resources of an out-of-order processor. We also then
+    // need to clean up redundancies and loop invariant code.
+    // FIXME: It would be really good to use a loop-integrated instruction
+    // combiner for cleanup here so that the unrolling and LICM can be pipelined
+    // across the loop nests.
+    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+    if (EnableUnrollAndJam && PTO.LoopUnrolling)
+      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
+    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+        PTO.ForgetAllSCEVInLoopUnroll)));
+    FPM.addPass(WarnMissedTransformationsPass());
+  }
+
+  if (!IsLTO) {
+    // Eliminate loads by forwarding stores from the previous iteration to loads
+    // of the current iteration.
+    FPM.addPass(LoopLoadEliminationPass());
+  }
+  // Cleanup after the loop optimization passes.
+  FPM.addPass(InstCombinePass());
+
+  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+    // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correlated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    FPM.addPass(EarlyCSEPass());
+    FPM.addPass(CorrelatedValuePropagationPass());
+    FPM.addPass(InstCombinePass());
+    LoopPassManager LPM;
+    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
+                                       OptimizationLevel::O3));
+    FPM.addPass(
+        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+    FPM.addPass(createFunctionToLoopPassAdaptor(
+        std::move(LPM), EnableMSSALoopDependency,
+        /*UseBlockFrequencyInfo=*/true));
+    FPM.addPass(SimplifyCFGPass());
+    FPM.addPass(InstCombinePass());
+  }
+
+  if (IsLTO) {
+    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));
+  } else {
+    // Now that we've formed fast to execute loop structures, we do further
+    // optimizations. These are run afterward as they might block doing complex
+    // analyses and transforms such as what are needed for loop vectorization.
+
+    // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+    // GVN, loop transforms, and others have already run, so it's now better to
+    // convert to more optimized IR using more aggressive simplify CFG options.
+    // The extra sinking transform can create larger basic blocks, so do this
+    // before SLP vectorization.
+    FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
+                                    .forwardSwitchCondToPhi(true)
+                                    .convertSwitchToLookupTable(true)
+                                    .needCanonicalLoops(false)
+                                    .hoistCommonInsts(true)
+                                    .sinkCommonInsts(true)));
+  }
+  if (IsLTO) {
+    FPM.addPass(SCCPPass());
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(BDCEPass());
+  }
+
+  // Optimize parallel scalar instruction chains into SIMD instructions.
+  if (PTO.SLPVectorization) {
+    FPM.addPass(SLPVectorizerPass());
+    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
+      FPM.addPass(EarlyCSEPass());
+    }
+  }
+  // Enhance/cleanup vector code.
+  FPM.addPass(VectorCombinePass());
+
+  if (!IsLTO) {
+    FPM.addPass(InstCombinePass());
+    // Unroll small loops to hide loop backedge latency and saturate any
+    // parallel execution resources of an out-of-order processor. We also then
+    // need to clean up redundancies and loop invariant code.
+    // FIXME: It would be really good to use a loop-integrated instruction
+    // combiner for cleanup here so that the unrolling and LICM can be pipelined
+    // across the loop nests.
+    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+    if (EnableUnrollAndJam && PTO.LoopUnrolling) {
+      FPM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
+    }
+    FPM.addPass(LoopUnrollPass(LoopUnrollOptions(
+        Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+        PTO.ForgetAllSCEVInLoopUnroll)));
+    FPM.addPass(WarnMissedTransformationsPass());
+    FPM.addPass(InstCombinePass());
+    FPM.addPass(
+        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+    FPM.addPass(createFunctionToLoopPassAdaptor(
+        LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+        EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true));
+  }
+
+  // Now that we've vectorized and unrolled loops, we may have more refined
+  // alignment information, try to re-derive it here.
+  FPM.addPass(AlignmentFromAssumptionsPass());
+
+  if (IsLTO)
+    FPM.addPass(InstCombinePass());
+}
+
 ModulePassManager
 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
                                              bool LTOPreLink) {
@@ -1295,91 +1416,7 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
   // from the TargetLibraryInfo.
   OptimizePM.addPass(InjectTLIMappings());
 
-  // Now run the core loop vectorizer.
-  OptimizePM.addPass(LoopVectorizePass(
-      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
-
-  // Eliminate loads by forwarding stores from the previous iteration to loads
-  // of the current iteration.
-  OptimizePM.addPass(LoopLoadEliminationPass());
-
-  // Cleanup after the loop optimization passes.
-  OptimizePM.addPass(InstCombinePass());
-
-  if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. We want to track correlated
-    // runtime checks for two inner loops in the same outer loop, fold any
-    // common computations, hoist loop-invariant aspects out of any outer loop,
-    // and unswitch the runtime checks if possible. Once hoisted, we may have
-    // dead (or speculatable) control flows or more combining opportunities.
-    OptimizePM.addPass(EarlyCSEPass());
-    OptimizePM.addPass(CorrelatedValuePropagationPass());
-    OptimizePM.addPass(InstCombinePass());
-    LoopPassManager LPM;
-    LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
-    LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level ==
-                                       OptimizationLevel::O3));
-    OptimizePM.addPass(
-        RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
-    OptimizePM.addPass(createFunctionToLoopPassAdaptor(
-        std::move(LPM), EnableMSSALoopDependency,
-        /*UseBlockFrequencyInfo=*/true));
-    OptimizePM.addPass(SimplifyCFGPass());
-    OptimizePM.addPass(InstCombinePass());
-  }
-
-  // Now that we've formed fast to execute loop structures, we do further
-  // optimizations. These are run afterward as they might block doing complex
-  // analyses and transforms such as what are needed for loop vectorization.
-
-  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  // The extra sinking transform can create larger basic blocks, so do this
-  // before SLP vectorization.
-  OptimizePM.addPass(SimplifyCFGPass(SimplifyCFGOptions()
-                                         .forwardSwitchCondToPhi(true)
-                                         .convertSwitchToLookupTable(true)
-                                         .needCanonicalLoops(false)
-                                         .hoistCommonInsts(true)
-                                         .sinkCommonInsts(true)));
-
-  // Optimize parallel scalar instruction chains into SIMD instructions.
-  if (PTO.SLPVectorization) {
-    OptimizePM.addPass(SLPVectorizerPass());
-    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
-      OptimizePM.addPass(EarlyCSEPass());
-    }
-  }
-
-  // Enhance/cleanup vector code.
-  OptimizePM.addPass(VectorCombinePass());
-  OptimizePM.addPass(InstCombinePass());
-
-  // Unroll small loops to hide loop backedge latency and saturate any parallel
-  // execution resources of an out-of-order processor. We also then need to
-  // clean up redundancies and loop invariant code.
-  // FIXME: It would be really good to use a loop-integrated instruction
-  // combiner for cleanup here so that the unrolling and LICM can be pipelined
-  // across the loop nests.
-  // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
-  if (EnableUnrollAndJam && PTO.LoopUnrolling) {
-    OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
-  }
-  OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(
-      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
-      PTO.ForgetAllSCEVInLoopUnroll)));
-  OptimizePM.addPass(WarnMissedTransformationsPass());
-  OptimizePM.addPass(InstCombinePass());
-  OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
-  OptimizePM.addPass(createFunctionToLoopPassAdaptor(
-      LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
-      EnableMSSALoopDependency, /*UseBlockFrequencyInfo=*/true));
-
-  // Now that we've vectorized and unrolled loops, we may have more refined
-  // alignment information, try to re-derive it here.
-  OptimizePM.addPass(AlignmentFromAssumptionsPass());
+  addVectorPasses(Level, OptimizePM, /* IsLTO */ false);
 
   // Split out cold code. Splitting is done late to avoid hiding context from
   // other optimizations and inadvertently regressing performance. The tradeoff
@@ -1825,39 +1862,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
       std::move(LPM), /*UseMemorySSA=*/false, /*UseBlockFrequencyInfo=*/true));
 
   MainFPM.addPass(LoopDistributePass());
-  MainFPM.addPass(LoopVectorizePass(
-      LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
-  // The vectorizer may have significantly shortened a loop body; unroll again.
-  MainFPM.addPass(LoopUnrollPass(LoopUnrollOptions(
-      Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
-      PTO.ForgetAllSCEVInLoopUnroll)));
-
-  MainFPM.addPass(WarnMissedTransformationsPass());
-
-  MainFPM.addPass(InstCombinePass());
-  MainFPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().hoistCommonInsts(true)));
-  MainFPM.addPass(SCCPPass());
-  MainFPM.addPass(InstCombinePass());
-  MainFPM.addPass(BDCEPass());
-
-  // More scalar chains could be vectorized due to more alias information
-  if (PTO.SLPVectorization) {
-    MainFPM.addPass(SLPVectorizerPass());
-    if (Level.getSpeedupLevel() > 1 && ExtraVectorizerPasses) {
-      MainFPM.addPass(EarlyCSEPass());
-    }
-  }
-
-  MainFPM.addPass(VectorCombinePass()); // Clean up partial vectorization.
-
-  // After vectorization, assume intrinsics may tell us more about pointer
-  // alignments.
-  MainFPM.addPass(AlignmentFromAssumptionsPass());
-
-  // FIXME: Conditionally run LoadCombine here, after it's ported
-  // (in case we still have this pass, given its questionable usefulness).
-
-  MainFPM.addPass(InstCombinePass());
+
+  addVectorPasses(Level, MainFPM, /* IsLTO */ true);
+
   invokePeepholeEPCallbacks(MainFPM, Level);
   MainFPM.addPass(JumpThreadingPass(/*InsertFreezeWhenUnfoldingSelect*/ true));
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(MainFPM)));
lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -523,6 +523,124 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
     MPM.add(createControlHeightReductionLegacyPass());
 }
 
+/// FIXME: Should LTO cause any differences to this set of passes?
+void PassManagerBuilder::addVectorPasses(legacy::PassManagerBase &PM,
+                                         bool IsLTO) {
+  PM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+  if (IsLTO) {
+    // The vectorizer may have significantly shortened a loop body; unroll
+    // again. Unroll small loops to hide loop backedge latency and saturate any
+    // parallel execution resources of an out-of-order processor. We also then
+    // need to clean up redundancies and loop invariant code.
+    // FIXME: It would be really good to use a loop-integrated instruction
+    // combiner for cleanup here so that the unrolling and LICM can be pipelined
+    // across the loop nests.
+    // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
+    if (EnableUnrollAndJam && !DisableUnrollLoops)
+      PM.add(createLoopUnrollAndJamPass(OptLevel));
+    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                ForgetAllSCEVInLoopUnroll));
+    PM.add(createWarnMissedTransformationsPass());
+  }
+
+  if (!IsLTO) {
+    // Eliminate loads by forwarding stores from the previous iteration to loads
+    // of the current iteration.
+    PM.add(createLoopLoadEliminationPass());
+  }
+  // Cleanup after the loop optimization passes.
+  PM.add(createInstructionCombiningPass());
+
+  if (OptLevel > 1 && ExtraVectorizerPasses) {
+    // At higher optimization levels, try to clean up any runtime overlap and
+    // alignment checks inserted by the vectorizer. We want to track correlated
+    // runtime checks for two inner loops in the same outer loop, fold any
+    // common computations, hoist loop-invariant aspects out of any outer loop,
+    // and unswitch the runtime checks if possible. Once hoisted, we may have
+    // dead (or speculatable) control flows or more combining opportunities.
+    PM.add(createEarlyCSEPass());
+    PM.add(createCorrelatedValuePropagationPass());
+    PM.add(createInstructionCombiningPass());
+    PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+    PM.add(createCFGSimplificationPass());
+    PM.add(createInstructionCombiningPass());
+  }
+
+  if (IsLTO) {
+    PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert
+                                           .hoistCommonInsts(true)));
+  } else {
+    // Now that we've formed fast to execute loop structures, we do further
+    // optimizations. These are run afterward as they might block doing complex
+    // analyses and transforms such as what are needed for loop vectorization.
+
+    // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+    // GVN, loop transforms, and others have already run, so it's now better to
+    // convert to more optimized IR using more aggressive simplify CFG options.
+    // The extra sinking transform can create larger basic blocks, so do this
+    // before SLP vectorization.
+    PM.add(createCFGSimplificationPass(SimplifyCFGOptions()
+                                           .forwardSwitchCondToPhi(true)
+                                           .convertSwitchToLookupTable(true)
+                                           .needCanonicalLoops(false)
+                                           .hoistCommonInsts(true)
+                                           .sinkCommonInsts(true)));
+  }
+  if (IsLTO) {
+    PM.add(createSCCPPass()); // Propagate exposed constants
+    PM.add(createInstructionCombiningPass()); // Clean up again
+    PM.add(createBitTrackingDCEPass());
+  }
+
+  // Optimize parallel scalar instruction chains into SIMD instructions.
+  if (SLPVectorize) {
+    PM.add(createSLPVectorizerPass());
+    if (OptLevel > 1 && ExtraVectorizerPasses)
+      PM.add(createEarlyCSEPass());
+  }
+
+  // Enhance/cleanup vector code.
+  PM.add(createVectorCombinePass());
+
+  if (!IsLTO) {
+    addExtensionsToPM(EP_Peephole, PM);
+    PM.add(createInstructionCombiningPass());
+
+    if (EnableUnrollAndJam && !DisableUnrollLoops) {
+      // Unroll and Jam. We do this before unroll but need to be in a separate
+      // loop pass manager in order for the outer loop to be processed by
+      // unroll and jam before the inner loop is unrolled.
+      PM.add(createLoopUnrollAndJamPass(OptLevel));
+    }
+
+    // Unroll small loops
+    PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+                                ForgetAllSCEVInLoopUnroll));
+
+    if (!DisableUnrollLoops) {
+      // LoopUnroll may generate some redundency to cleanup.
+      PM.add(createInstructionCombiningPass());
+
+      // Runtime unrolling will introduce runtime check in loop prologue. If the
+      // unrolled loop is a inner loop, then the prologue will be inside the
+      // outer loop. LICM pass can help to promote the runtime check out if the
+      // checked value is loop invariant.
+      PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+    }
+
+    PM.add(createWarnMissedTransformationsPass());
+  }
+
+  // After vectorization and unrolling, assume intrinsics may tell us more
+  // about pointer alignments.
+  PM.add(createAlignmentFromAssumptionsPass());
+
+  if (IsLTO)
+    PM.add(createInstructionCombiningPass());
+}
+
 void PassManagerBuilder::populateModulePassManager(
     legacy::PassManagerBase &MPM) {
   // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
@@ -794,86 +912,7 @@ void PassManagerBuilder::populateModulePassManager(
   // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
   MPM.add(createLoopDistributePass());
 
-  MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
-  // Eliminate loads by forwarding stores from the previous iteration to loads
-  // of the current iteration.
-  MPM.add(createLoopLoadEliminationPass());
-
-  // FIXME: Because of #pragma vectorize enable, the passes below are always
-  // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
-  // on -O1 and no #pragma is found). Would be good to have these two passes
-  // as function calls, so that we can only pass them when the vectorizer
-  // changed the code.
-  MPM.add(createInstructionCombiningPass());
-  if (OptLevel > 1 && ExtraVectorizerPasses) {
-    // At higher optimization levels, try to clean up any runtime overlap and
-    // alignment checks inserted by the vectorizer. We want to track correllated
-    // runtime checks for two inner loops in the same outer loop, fold any
-    // common computations, hoist loop-invariant aspects out of any outer loop,
-    // and unswitch the runtime checks if possible. Once hoisted, we may have
-    // dead (or speculatable) control flows or more combining opportunities.
-    MPM.add(createEarlyCSEPass());
-    MPM.add(createCorrelatedValuePropagationPass());
-    MPM.add(createInstructionCombiningPass());
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
-    MPM.add(createCFGSimplificationPass());
-    MPM.add(createInstructionCombiningPass());
-  }
-
-  // Cleanup after loop vectorization, etc. Simplification passes like CVP and
-  // GVN, loop transforms, and others have already run, so it's now better to
-  // convert to more optimized IR using more aggressive simplify CFG options.
-  // The extra sinking transform can create larger basic blocks, so do this
-  // before SLP vectorization.
-  MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
-                                          .forwardSwitchCondToPhi(true)
-                                          .convertSwitchToLookupTable(true)
-                                          .needCanonicalLoops(false)
-                                          .hoistCommonInsts(true)
-                                          .sinkCommonInsts(true)));
-
-  if (SLPVectorize) {
-    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-    if (OptLevel > 1 && ExtraVectorizerPasses) {
-      MPM.add(createEarlyCSEPass());
-    }
-  }
-
-  // Enhance/cleanup vector code.
-  MPM.add(createVectorCombinePass());
-
-  addExtensionsToPM(EP_Peephole, MPM);
-  MPM.add(createInstructionCombiningPass());
-
-  if (EnableUnrollAndJam && !DisableUnrollLoops) {
-    // Unroll and Jam. We do this before unroll but need to be in a separate
-    // loop pass manager in order for the outer loop to be processed by
-    // unroll and jam before the inner loop is unrolled.
-    MPM.add(createLoopUnrollAndJamPass(OptLevel));
-  }
-
-  // Unroll small loops
-  MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                               ForgetAllSCEVInLoopUnroll));
-
-  if (!DisableUnrollLoops) {
-    // LoopUnroll may generate some redundency to cleanup.
-    MPM.add(createInstructionCombiningPass());
-
-    // Runtime unrolling will introduce runtime check in loop prologue. If the
-    // unrolled loop is a inner loop, then the prologue will be inside the
-    // outer loop. LICM pass can help to promote the runtime check out if the
-    // checked value is loop invariant.
-    MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
-  }
-
-  MPM.add(createWarnMissedTransformationsPass());
-
-  // After vectorization and unrolling, assume intrinsics may tell us more
-  // about pointer alignments.
-  MPM.add(createAlignmentFromAssumptionsPass());
+  addVectorPasses(MPM, /* IsLTO */ false);
 
   // FIXME: We shouldn't bother with this anymore.
   MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
@@ -1083,35 +1122,9 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
                                     ForgetAllSCEVInLoopUnroll));
   PM.add(createLoopDistributePass());
-  PM.add(createLoopVectorizePass(true, !LoopVectorize));
-  // The vectorizer may have significantly shortened a loop body; unroll again.
-  PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
-                              ForgetAllSCEVInLoopUnroll));
-
-  PM.add(createWarnMissedTransformationsPass());
-
-  // Now that we've optimized loops (in particular loop induction variables),
-  // we may have exposed more scalar opportunities. Run parts of the scalar
-  // optimizer again at this point.
-  PM.add(createInstructionCombiningPass()); // Initial cleanup
-  PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert
-                                         .hoistCommonInsts(true)));
-  PM.add(createSCCPPass()); // Propagate exposed constants
-  PM.add(createInstructionCombiningPass()); // Clean up again
-  PM.add(createBitTrackingDCEPass());
-
-  // More scalar chains could be vectorized due to more alias information
-  if (SLPVectorize)
-    PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
-  PM.add(createVectorCombinePass()); // Clean up partial vectorization.
-
-  // After vectorization, assume intrinsics may tell us more about pointer
-  // alignments.
-  PM.add(createAlignmentFromAssumptionsPass());
-
-  // Cleanup and simplify the code after the scalar optimizations.
-  PM.add(createInstructionCombiningPass());
+
+  addVectorPasses(PM, /* IsLTO */ true);
+
   addExtensionsToPM(EP_Peephole, PM);
 
   PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));