diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index 3339c039ff8..619844256ad 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -70,7 +70,7 @@ D: Branch weights and BlockFrequencyInfo
 
 N: Hal Finkel
 E: hfinkel@anl.gov
-D: BBVectorize, the loop reroller, alias analysis and the PowerPC target
+D: The loop reroller, alias analysis and the PowerPC target
 
 N: Dan Gohman
 E: sunfish@mozilla.com
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 5939805a981..ddb31acfd02 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -54,8 +54,9 @@ Non-comprehensive list of changes in this release
   its nature as a general purpose PDB manipulation / diagnostics tool that does
   more than just dumping contents.
   
-
-* ... next change ...
+* The ``BBVectorize`` pass has been removed. It was fully replaced and has not
+  been used since 2014, but we had not gotten around to removing it until now.
+  The SLP vectorizer is the suggested non-loop vectorization pass.
 
 .. NOTE
    If you would like to document a larger change, then you can add a
@@ -111,7 +112,11 @@ Changes to the OCaml bindings
 Changes to the C API
 --------------------
 
- During this release ...
+* Deprecated the ``LLVMAddBBVectorizePass`` interface since the ``BBVectorize``
+  pass has been removed. It is now a no-op and will be removed in the next
+  release. Use ``LLVMAddSLPVectorizePass`` instead to get the supported SLP
+  vectorizer.
+
 
 External Open Source Projects Using LLVM 5
 ==========================================
diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h
index a82ef49cb16..cf8306aee76 100644
--- a/include/llvm-c/Transforms/Vectorize.h
+++ b/include/llvm-c/Transforms/Vectorize.h
@@ -33,7 +33,7 @@ extern "C" {
  * @{
  */
 
-/** See llvm::createBBVectorizePass function. */
+/** DEPRECATED - Use LLVMAddSLPVectorizePass */
 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
 
 /** See llvm::createLoopVectorizePass function. */
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index a52fa3b542a..aab14070dbd 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -70,7 +70,6 @@ void initializeAlwaysInlinerLegacyPassPass(PassRegistry&);
 void initializeArgPromotionPass(PassRegistry&);
 void initializeAssumptionCacheTrackerPass(PassRegistry&);
 void initializeAtomicExpandPass(PassRegistry&);
-void initializeBBVectorizePass(PassRegistry&);
 void initializeBDCELegacyPassPass(PassRegistry&);
 void initializeBarrierNoopPass(PassRegistry&);
 void initializeBasicAAWrapperPassPass(PassRegistry&);
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index c309ddbe2f0..d07c15c1013 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -195,7 +195,6 @@ namespace {
       (void) llvm::createLoopVectorizePass();
       (void) llvm::createSLPVectorizerPass();
       (void) llvm::createLoadStoreVectorizerPass();
-      (void) llvm::createBBVectorizePass();
       (void) llvm::createPartiallyInlineLibCallsPass();
       (void) llvm::createScalarizerPass();
       (void) llvm::createSeparateConstOffsetFromGEPPass();
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index db4bfb15f51..276306f686f 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -145,7 +145,6 @@ public:
   bool DisableTailCalls;
   bool DisableUnitAtATime;
   bool DisableUnrollLoops;
-  bool BBVectorize;
   bool SLPVectorize;
   bool LoopVectorize;
   bool RerollLoops;
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index f734e299c6e..19845e471e4 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -106,13 +106,6 @@ struct VectorizeConfig {
   VectorizeConfig();
 };
 
-//===----------------------------------------------------------------------===//
-//
-// BBVectorize - A basic-block vectorization pass.
-//
-BasicBlockPass *
-createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());
-
 //===----------------------------------------------------------------------===//
 //
 // LoopVectorize - Create a loop vectorization pass.
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 5538756b8bf..5b1b58b89c3 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -55,10 +55,6 @@ static cl::opt<bool>
 RunSLPVectorization("vectorize-slp", cl::Hidden,
                     cl::desc("Run the SLP vectorization passes"));
 
-static cl::opt<bool>
-RunBBVectorization("vectorize-slp-aggressive", cl::Hidden,
-                    cl::desc("Run the BB vectorization passes"));
-
 static cl::opt<bool>
 UseGVNAfterVectorization("use-gvn-after-vectorization",
   cl::init(false), cl::Hidden,
@@ -166,7 +162,6 @@ PassManagerBuilder::PassManagerBuilder() {
     Inliner = nullptr;
     DisableUnitAtATime = false;
     DisableUnrollLoops = false;
-    BBVectorize = RunBBVectorization;
     SLPVectorize = RunSLPVectorization;
     LoopVectorize = RunLoopVectorization;
     RerollLoops = RunLoopRerolling;
@@ -384,26 +379,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
 
   if (RerollLoops)
     MPM.add(createLoopRerollPass());
-  if (!RunSLPAfterLoopVectorization) {
-    if (SLPVectorize)
-      MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
-
-    if (BBVectorize) {
-      MPM.add(createBBVectorizePass());
-      addInstructionCombiningPass(MPM);
-      addExtensionsToPM(EP_Peephole, MPM);
-      if (OptLevel > 1 && UseGVNAfterVectorization)
-        MPM.add(NewGVN
-                    ? createNewGVNPass()
-                    : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
-      else
-        MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
-
-      // BBVectorize may have significantly shortened a loop body; unroll again.
-      if (!DisableUnrollLoops)
-        MPM.add(createLoopUnrollPass(OptLevel));
-    }
-  }
+  if (!RunSLPAfterLoopVectorization && SLPVectorize)
+    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
 
   MPM.add(createAggressiveDCEPass());         // Delete dead instructions
   MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
@@ -635,28 +612,10 @@ void PassManagerBuilder::populateModulePassManager(
     addInstructionCombiningPass(MPM);
   }
 
-  if (RunSLPAfterLoopVectorization) {
-    if (SLPVectorize) {
-      MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
-      if (OptLevel > 1 && ExtraVectorizerPasses) {
-        MPM.add(createEarlyCSEPass());
-      }
-    }
-
-    if (BBVectorize) {
-      MPM.add(createBBVectorizePass());
-      addInstructionCombiningPass(MPM);
-      addExtensionsToPM(EP_Peephole, MPM);
-      if (OptLevel > 1 && UseGVNAfterVectorization)
-        MPM.add(NewGVN
-                    ? createNewGVNPass()
-                    : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
-      else
-        MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
-
-      // BBVectorize may have significantly shortened a loop body; unroll again.
-      if (!DisableUnrollLoops)
-        MPM.add(createLoopUnrollPass(OptLevel));
+  if (RunSLPAfterLoopVectorization && SLPVectorize) {
+    MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+    if (OptLevel > 1 && ExtraVectorizerPasses) {
+      MPM.add(createEarlyCSEPass());
     }
   }
 
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
deleted file mode 100644
index 78453aaa16c..00000000000
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ /dev/null
@@ -1,3282 +0,0 @@
-//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a basic-block vectorization pass. The algorithm was
-// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
-// et al. It works by looking for chains of pairable operations and then
-// pairing them.
-//
-//===----------------------------------------------------------------------===//
-
-#define BBV_NAME "bb-vectorize"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
-#include <algorithm>
-using namespace llvm;
-
-#define DEBUG_TYPE BBV_NAME
-
-static cl::opt<bool>
-IgnoreTargetInfo("bb-vectorize-ignore-target-info",  cl::init(false),
-  cl::Hidden, cl::desc("Ignore target information"));
-
-static cl::opt<unsigned>
-ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
-  cl::desc("The required chain depth for vectorization"));
-
-static cl::opt<bool>
-UseChainDepthWithTI("bb-vectorize-use-chain-depth",  cl::init(false),
-  cl::Hidden, cl::desc("Use the chain depth requirement with"
-                       " target information"));
-
-static cl::opt<unsigned>
-SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
-  cl::desc("The maximum search distance for instruction pairs"));
-
-static cl::opt<bool>
-SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
-  cl::desc("Replicating one element to a pair breaks the chain"));
-
-static cl::opt<unsigned>
-VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
-  cl::desc("The size of the native vector registers"));
-
-static cl::opt<unsigned>
-MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
-  cl::desc("The maximum number of pairing iterations"));
-
-static cl::opt<bool>
-Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to form non-2^n-length vectors"));
-
-static cl::opt<unsigned>
-MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
-  cl::desc("The maximum number of pairable instructions per group"));
-
-static cl::opt<unsigned>
-MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
-  cl::desc("The maximum number of candidate instruction pairs per group"));
-
-static cl::opt<unsigned>
-MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
-  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
-                       " a full cycle check"));
-
-static cl::opt<bool>
-NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize boolean (i1) values"));
-
-static cl::opt<bool>
-NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize integer values"));
-
-static cl::opt<bool>
-NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize floating-point values"));
-
-// FIXME: This should default to false once pointer vector support works.
-static cl::opt<bool>
-NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
-  cl::desc("Don't try to vectorize pointer values"));
-
-static cl::opt<bool>
-NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize casting (conversion) operations"));
-
-static cl::opt<bool>
-NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize floating-point math intrinsics"));
-
-static cl::opt<bool>
-  NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize BitManipulation intrinsics"));
-
-static cl::opt<bool>
-NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
-
-static cl::opt<bool>
-NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize select instructions"));
-
-static cl::opt<bool>
-NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize comparison instructions"));
-
-static cl::opt<bool>
-NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize getelementptr instructions"));
-
-static cl::opt<bool>
-NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
-  cl::desc("Don't try to vectorize loads and stores"));
-
-static cl::opt<bool>
-AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
-  cl::desc("Only generate aligned loads and stores"));
-
-static cl::opt<bool>
-NoMemOpBoost("bb-vectorize-no-mem-op-boost",
-  cl::init(false), cl::Hidden,
-  cl::desc("Don't boost the chain-depth contribution of loads and stores"));
-
-static cl::opt<bool>
-FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
-  cl::desc("Use a fast instruction dependency analysis"));
-
-#ifndef NDEBUG
-static cl::opt<bool>
-DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
-  cl::init(false), cl::Hidden,
-  cl::desc("When debugging is enabled, output information on the"
-           " instruction-examination process"));
-static cl::opt<bool>
-DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
-  cl::init(false), cl::Hidden,
-  cl::desc("When debugging is enabled, output information on the"
-           " candidate-selection process"));
-static cl::opt<bool>
-DebugPairSelection("bb-vectorize-debug-pair-selection",
-  cl::init(false), cl::Hidden,
-  cl::desc("When debugging is enabled, output information on the"
-           " pair-selection process"));
-static cl::opt<bool>
-DebugCycleCheck("bb-vectorize-debug-cycle-check",
-  cl::init(false), cl::Hidden,
-  cl::desc("When debugging is enabled, output information on the"
-           " cycle-checking process"));
-
-static cl::opt<bool>
-PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
-  cl::init(false), cl::Hidden,
-  cl::desc("When debugging is enabled, dump the basic block after"
-           " every pair is fused"));
-#endif
-
-STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
-
-namespace {
-  struct BBVectorize : public BasicBlockPass {
-    static char ID; // Pass identification, replacement for typeid
-
-    const VectorizeConfig Config;
-
-    BBVectorize(const VectorizeConfig &C = VectorizeConfig())
-      : BasicBlockPass(ID), Config(C) {
-      initializeBBVectorizePass(*PassRegistry::getPassRegistry());
-    }
-
-    BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
-      : BasicBlockPass(ID), Config(C) {
-      AA = &P->getAnalysis<AAResultsWrapperPass>().getAAResults();
-      DT = &P->getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-      SE = &P->getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-      TLI = &P->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-      TTI = IgnoreTargetInfo
-                ? nullptr
-                : &P->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    }
-
-    typedef std::pair<Value *, Value *> ValuePair;
-    typedef std::pair<ValuePair, int> ValuePairWithCost;
-    typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
-    typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
-    typedef std::pair<VPPair, unsigned> VPPairWithType;
-
-    AliasAnalysis *AA;
-    DominatorTree *DT;
-    ScalarEvolution *SE;
-    const TargetLibraryInfo *TLI;
-    const TargetTransformInfo *TTI;
-
-    // FIXME: const correct?
-
-    bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
-
-    bool getCandidatePairs(BasicBlock &BB,
-                       BasicBlock::iterator &Start,
-                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-                       DenseSet<ValuePair> &FixedOrderPairs,
-                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
-                       std::vector<Value *> &PairableInsts, bool NonPow2Len);
-
-    // FIXME: The current implementation does not account for pairs that
-    // are connected in multiple ways. For example:
-    //   C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
-    enum PairConnectionType {
-      PairConnectionDirect,
-      PairConnectionSwap,
-      PairConnectionSplat
-    };
-
-    void computeConnectedPairs(
-             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-             DenseSet<ValuePair> &CandidatePairsSet,
-             std::vector<Value *> &PairableInsts,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseMap<VPPair, unsigned> &PairConnectionTypes);
-
-    void buildDepMap(BasicBlock &BB,
-             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-             std::vector<Value *> &PairableInsts,
-             DenseSet<ValuePair> &PairableInstUsers);
-
-    void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-             DenseSet<ValuePair> &CandidatePairsSet,
-             DenseMap<ValuePair, int> &CandidatePairCostSavings,
-             std::vector<Value *> &PairableInsts,
-             DenseSet<ValuePair> &FixedOrderPairs,
-             DenseMap<VPPair, unsigned> &PairConnectionTypes,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
-             DenseSet<ValuePair> &PairableInstUsers,
-             DenseMap<Value *, Value *>& ChosenPairs);
-
-    void fuseChosenPairs(BasicBlock &BB,
-             std::vector<Value *> &PairableInsts,
-             DenseMap<Value *, Value *>& ChosenPairs,
-             DenseSet<ValuePair> &FixedOrderPairs,
-             DenseMap<VPPair, unsigned> &PairConnectionTypes,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps);
-
-
-    bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
-
-    bool areInstsCompatible(Instruction *I, Instruction *J,
-                       bool IsSimpleLoadStore, bool NonPow2Len,
-                       int &CostSavings, int &FixedOrder);
-
-    bool trackUsesOfI(DenseSet<Value *> &Users,
-                      AliasSetTracker &WriteSet, Instruction *I,
-                      Instruction *J, bool UpdateUsers = true,
-                      DenseSet<ValuePair> *LoadMoveSetPairs = nullptr);
-
-  void computePairsConnectedTo(
-             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-             DenseSet<ValuePair> &CandidatePairsSet,
-             std::vector<Value *> &PairableInsts,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseMap<VPPair, unsigned> &PairConnectionTypes,
-             ValuePair P);
-
-    bool pairsConflict(ValuePair P, ValuePair Q,
-             DenseSet<ValuePair> &PairableInstUsers,
-             DenseMap<ValuePair, std::vector<ValuePair> >
-               *PairableInstUserMap = nullptr,
-             DenseSet<VPPair> *PairableInstUserPairSet = nullptr);
-
-    bool pairWillFormCycle(ValuePair P,
-             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers,
-             DenseSet<ValuePair> &CurrentPairs);
-
-    void pruneDAGFor(
-             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-             std::vector<Value *> &PairableInsts,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseSet<ValuePair> &PairableInstUsers,
-             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
-             DenseSet<VPPair> &PairableInstUserPairSet,
-             DenseMap<Value *, Value *> &ChosenPairs,
-             DenseMap<ValuePair, size_t> &DAG,
-             DenseSet<ValuePair> &PrunedDAG, ValuePair J,
-             bool UseCycleCheck);
-
-    void buildInitialDAGFor(
-             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-             DenseSet<ValuePair> &CandidatePairsSet,
-             std::vector<Value *> &PairableInsts,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseSet<ValuePair> &PairableInstUsers,
-             DenseMap<Value *, Value *> &ChosenPairs,
-             DenseMap<ValuePair, size_t> &DAG, ValuePair J);
-
-    void findBestDAGFor(
-             DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-             DenseSet<ValuePair> &CandidatePairsSet,
-             DenseMap<ValuePair, int> &CandidatePairCostSavings,
-             std::vector<Value *> &PairableInsts,
-             DenseSet<ValuePair> &FixedOrderPairs,
-             DenseMap<VPPair, unsigned> &PairConnectionTypes,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
-             DenseSet<ValuePair> &PairableInstUsers,
-             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
-             DenseSet<VPPair> &PairableInstUserPairSet,
-             DenseMap<Value *, Value *> &ChosenPairs,
-             DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
-             int &BestEffSize, Value *II, std::vector<Value *>&JJ,
-             bool UseCycleCheck);
-
-    Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
-                     Instruction *J, unsigned o);
-
-    void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
-                     unsigned MaskOffset, unsigned NumInElem,
-                     unsigned NumInElem1, unsigned IdxOffset,
-                     std::vector<Constant*> &Mask);
-
-    Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
-                     Instruction *J);
-
-    bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
-                       unsigned o, Value *&LOp, unsigned numElemL,
-                       Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
-                       unsigned IdxOff = 0);
-
-    Value *getReplacementInput(LLVMContext& Context, Instruction *I,
-                     Instruction *J, unsigned o, bool IBeforeJ);
-
-    void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
-                     Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
-                     bool IBeforeJ);
-
-    void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
-                     Instruction *J, Instruction *K,
-                     Instruction *&InsertionPt, Instruction *&K1,
-                     Instruction *&K2);
-
-    void collectPairLoadMoveSet(BasicBlock &BB,
-                     DenseMap<Value *, Value *> &ChosenPairs,
-                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
-                     DenseSet<ValuePair> &LoadMoveSetPairs,
-                     Instruction *I);
-
-    void collectLoadMoveSet(BasicBlock &BB,
-                     std::vector<Value *> &PairableInsts,
-                     DenseMap<Value *, Value *> &ChosenPairs,
-                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
-                     DenseSet<ValuePair> &LoadMoveSetPairs);
-
-    bool canMoveUsesOfIAfterJ(BasicBlock &BB,
-                     DenseSet<ValuePair> &LoadMoveSetPairs,
-                     Instruction *I, Instruction *J);
-
-    void moveUsesOfIAfterJ(BasicBlock &BB,
-                     DenseSet<ValuePair> &LoadMoveSetPairs,
-                     Instruction *&InsertionPt,
-                     Instruction *I, Instruction *J);
-
-    bool vectorizeBB(BasicBlock &BB) {
-      if (skipBasicBlock(BB))
-        return false;
-      if (!DT->isReachableFromEntry(&BB)) {
-        DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
-              " in " << BB.getParent()->getName() << "\n");
-        return false;
-      }
-
-      DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
-
-      bool changed = false;
-      // Iterate a sufficient number of times to merge types of size 1 bit,
-      // then 2 bits, then 4, etc. up to half of the target vector width of the
-      // target vector register.
-      unsigned n = 1;
-      for (unsigned v = 2;
-           (TTI || v <= Config.VectorBits) &&
-           (!Config.MaxIter || n <= Config.MaxIter);
-           v *= 2, ++n) {
-        DEBUG(dbgs() << "BBV: fusing loop #" << n <<
-              " for " << BB.getName() << " in " <<
-              BB.getParent()->getName() << "...\n");
-        if (vectorizePairs(BB))
-          changed = true;
-        else
-          break;
-      }
-
-      if (changed && !Pow2LenOnly) {
-        ++n;
-        for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
-          DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
-                n << " for " << BB.getName() << " in " <<
-                BB.getParent()->getName() << "...\n");
-          if (!vectorizePairs(BB, true)) break;
-        }
-      }
-
-      DEBUG(dbgs() << "BBV: done!\n");
-      return changed;
-    }
-
-    bool runOnBasicBlock(BasicBlock &BB) override {
-      // OptimizeNone check deferred to vectorizeBB().
-
-      AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-      DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-      TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-      TTI = IgnoreTargetInfo
-                ? nullptr
-                : &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
-                      *BB.getParent());
-
-      return vectorizeBB(BB);
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      BasicBlockPass::getAnalysisUsage(AU);
-      AU.addRequired<AAResultsWrapperPass>();
-      AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<ScalarEvolutionWrapperPass>();
-      AU.addRequired<TargetLibraryInfoWrapperPass>();
-      AU.addRequired<TargetTransformInfoWrapperPass>();
-      AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addPreserved<GlobalsAAWrapperPass>();
-      AU.addPreserved<ScalarEvolutionWrapperPass>();
-      AU.addPreserved<SCEVAAWrapperPass>();
-      AU.setPreservesCFG();
-    }
-
-    static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
-      assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
-             "Cannot form vector from incompatible scalar types");
-      Type *STy = ElemTy->getScalarType();
-
-      unsigned numElem;
-      if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
-        numElem = VTy->getNumElements();
-      } else {
-        numElem = 1;
-      }
-
-      if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) {
-        numElem += VTy->getNumElements();
-      } else {
-        numElem += 1;
-      }
-
-      return VectorType::get(STy, numElem);
-    }
-
-    static inline void getInstructionTypes(Instruction *I,
-                                           Type *&T1, Type *&T2) {
-      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        // For stores, it is the value type, not the pointer type that matters
-        // because the value is what will come from a vector register.
-
-        Value *IVal = SI->getValueOperand();
-        T1 = IVal->getType();
-      } else {
-        T1 = I->getType();
-      }
-
-      if (CastInst *CI = dyn_cast<CastInst>(I))
-        T2 = CI->getSrcTy();
-      else
-        T2 = T1;
-
-      if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
-        T2 = SI->getCondition()->getType();
-      } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
-        T2 = SI->getOperand(0)->getType();
-      } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
-        T2 = CI->getOperand(0)->getType();
-      }
-    }
-
-    // Returns the weight associated with the provided value. A chain of
-    // candidate pairs has a length given by the sum of the weights of its
-    // members (one weight per pair; the weight of each member of the pair
-    // is assumed to be the same). This length is then compared to the
-    // chain-length threshold to determine if a given chain is significant
-    // enough to be vectorized. The length is also used in comparing
-    // candidate chains where longer chains are considered to be better.
-    // Note: when this function returns 0, the resulting instructions are
-    // not actually fused.
-    inline size_t getDepthFactor(Value *V) {
-      // InsertElement and ExtractElement have a depth factor of zero. This is
-      // for two reasons: First, they cannot be usefully fused. Second, because
-      // the pass generates a lot of these, they can confuse the simple metric
-      // used to compare the dags in the next iteration. Thus, giving them a
-      // weight of zero allows the pass to essentially ignore them in
-      // subsequent iterations when looking for vectorization opportunities
-      // while still tracking dependency chains that flow through those
-      // instructions.
-      if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
-        return 0;
-
-      // Give a load or store half of the required depth so that load/store
-      // pairs will vectorize.
-      if (!Config.NoMemOpBoost && (isa<LoadInst>(V) || isa<StoreInst>(V)))
-        return Config.ReqChainDepth/2;
-
-      return 1;
-    }
-
-    // Returns the cost of the provided instruction using TTI.
-    // This does not handle loads and stores.
-    unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
-                          TargetTransformInfo::OperandValueKind Op1VK =
-                              TargetTransformInfo::OK_AnyValue,
-                          TargetTransformInfo::OperandValueKind Op2VK =
-                              TargetTransformInfo::OK_AnyValue,
-                          const Instruction *I = nullptr) {
-      switch (Opcode) {
-      default: break;
-      case Instruction::GetElementPtr:
-        // We mark this instruction as zero-cost because scalar GEPs are usually
-        // lowered to the instruction addressing mode. At the moment we don't
-        // generate vector GEPs.
-        return 0;
-      case Instruction::Br:
-        return TTI->getCFInstrCost(Opcode);
-      case Instruction::PHI:
-        return 0;
-      case Instruction::Add:
-      case Instruction::FAdd:
-      case Instruction::Sub:
-      case Instruction::FSub:
-      case Instruction::Mul:
-      case Instruction::FMul:
-      case Instruction::UDiv:
-      case Instruction::SDiv:
-      case Instruction::FDiv:
-      case Instruction::URem:
-      case Instruction::SRem:
-      case Instruction::FRem:
-      case Instruction::Shl:
-      case Instruction::LShr:
-      case Instruction::AShr:
-      case Instruction::And:
-      case Instruction::Or:
-      case Instruction::Xor:
-        return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
-      case Instruction::Select:
-      case Instruction::ICmp:
-      case Instruction::FCmp:
-        return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
-      case Instruction::ZExt:
-      case Instruction::SExt:
-      case Instruction::FPToUI:
-      case Instruction::FPToSI:
-      case Instruction::FPExt:
-      case Instruction::PtrToInt:
-      case Instruction::IntToPtr:
-      case Instruction::SIToFP:
-      case Instruction::UIToFP:
-      case Instruction::Trunc:
-      case Instruction::FPTrunc:
-      case Instruction::BitCast:
-      case Instruction::ShuffleVector:
-        return TTI->getCastInstrCost(Opcode, T1, T2, I);
-      }
-
-      return 1;
-    }
-
-    // This determines the relative offset of two loads or stores, returning
-    // true if the offset could be determined to be some constant value.
-    // For example, if OffsetInElmts == 1, then J accesses the memory directly
-    // after I; if OffsetInElmts == -1 then I accesses the memory
-    // directly after J.
-    bool getPairPtrInfo(Instruction *I, Instruction *J,
-        Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
-        unsigned &IAddressSpace, unsigned &JAddressSpace,
-        int64_t &OffsetInElmts, bool ComputeOffset = true) {
-      OffsetInElmts = 0;
-      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        LoadInst *LJ = cast<LoadInst>(J);
-        IPtr = LI->getPointerOperand();
-        JPtr = LJ->getPointerOperand();
-        IAlignment = LI->getAlignment();
-        JAlignment = LJ->getAlignment();
-        IAddressSpace = LI->getPointerAddressSpace();
-        JAddressSpace = LJ->getPointerAddressSpace();
-      } else {
-        StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
-        IPtr = SI->getPointerOperand();
-        JPtr = SJ->getPointerOperand();
-        IAlignment = SI->getAlignment();
-        JAlignment = SJ->getAlignment();
-        IAddressSpace = SI->getPointerAddressSpace();
-        JAddressSpace = SJ->getPointerAddressSpace();
-      }
-
-      if (!ComputeOffset)
-        return true;
-
-      const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
-      const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
-
-      // If this is a trivial offset, then we'll get something like
-      // 1*sizeof(type). With target data, which we need anyway, this will get
-      // constant folded into a number.
-      const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
-      if (const SCEVConstant *ConstOffSCEV =
-            dyn_cast<SCEVConstant>(OffsetSCEV)) {
-        ConstantInt *IntOff = ConstOffSCEV->getValue();
-        int64_t Offset = IntOff->getSExtValue();
-        const DataLayout &DL = I->getModule()->getDataLayout();
-        Type *VTy = IPtr->getType()->getPointerElementType();
-        int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
-
-        Type *VTy2 = JPtr->getType()->getPointerElementType();
-        if (VTy != VTy2 && Offset < 0) {
-          int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
-          OffsetInElmts = Offset/VTy2TSS;
-          return (std::abs(Offset) % VTy2TSS) == 0;
-        }
-
-        OffsetInElmts = Offset/VTyTSS;
-        return (std::abs(Offset) % VTyTSS) == 0;
-      }
-
-      return false;
-    }
-
-    // Returns true if the provided CallInst represents an intrinsic that can
-    // be vectorized.
-    bool isVectorizableIntrinsic(CallInst* I) {
-      Function *F = I->getCalledFunction();
-      if (!F) return false;
-
-      Intrinsic::ID IID = F->getIntrinsicID();
-      if (!IID) return false;
-
-      switch(IID) {
-      default:
-        return false;
-      case Intrinsic::sqrt:
-      case Intrinsic::powi:
-      case Intrinsic::sin:
-      case Intrinsic::cos:
-      case Intrinsic::log:
-      case Intrinsic::log2:
-      case Intrinsic::log10:
-      case Intrinsic::exp:
-      case Intrinsic::exp2:
-      case Intrinsic::pow:
-      case Intrinsic::round:
-      case Intrinsic::copysign:
-      case Intrinsic::ceil:
-      case Intrinsic::nearbyint:
-      case Intrinsic::rint:
-      case Intrinsic::trunc:
-      case Intrinsic::floor:
-      case Intrinsic::fabs:
-      case Intrinsic::minnum:
-      case Intrinsic::maxnum:
-        return Config.VectorizeMath;
-      case Intrinsic::bswap:
-      case Intrinsic::ctpop:
-      case Intrinsic::ctlz:
-      case Intrinsic::cttz:
-        return Config.VectorizeBitManipulations;
-      case Intrinsic::fma:
-      case Intrinsic::fmuladd:
-        return Config.VectorizeFMA;
-      }
-    }
-
-    bool isPureIEChain(InsertElementInst *IE) {
-      InsertElementInst *IENext = IE;
-      do {
-        if (!isa<UndefValue>(IENext->getOperand(0)) &&
-            !isa<InsertElementInst>(IENext->getOperand(0))) {
-          return false;
-        }
-      } while ((IENext =
-                 dyn_cast<InsertElementInst>(IENext->getOperand(0))));
-
-      return true;
-    }
-  };
-
-  // This function implements one vectorization iteration on the provided
-  // basic block. It returns true if the block is changed.
-  bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
-    bool ShouldContinue;
-    BasicBlock::iterator Start = BB.getFirstInsertionPt();
-
-    std::vector<Value *> AllPairableInsts;
-    DenseMap<Value *, Value *> AllChosenPairs;
-    DenseSet<ValuePair> AllFixedOrderPairs;
-    DenseMap<VPPair, unsigned> AllPairConnectionTypes;
-    DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs,
-                                                 AllConnectedPairDeps;
-
-    do {
-      std::vector<Value *> PairableInsts;
-      DenseMap<Value *, std::vector<Value *> > CandidatePairs;
-      DenseSet<ValuePair> FixedOrderPairs;
-      DenseMap<ValuePair, int> CandidatePairCostSavings;
-      ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
-                                         FixedOrderPairs,
-                                         CandidatePairCostSavings,
-                                         PairableInsts, NonPow2Len);
-      if (PairableInsts.empty()) continue;
-
-      // Build the candidate pair set for faster lookups.
-      DenseSet<ValuePair> CandidatePairsSet;
-      for (DenseMap<Value *, std::vector<Value *> >::iterator I =
-           CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
-        for (std::vector<Value *>::iterator J = I->second.begin(),
-             JE = I->second.end(); J != JE; ++J)
-          CandidatePairsSet.insert(ValuePair(I->first, *J));
-
-      // Now we have a map of all of the pairable instructions and we need to
-      // select the best possible pairing. A good pairing is one such that the
-      // users of the pair are also paired. This defines a (directed) forest
-      // over the pairs such that two pairs are connected iff the second pair
-      // uses the first.
-
-      // Note that it only matters that both members of the second pair use some
-      // element of the first pair (to allow for splatting).
-
-      DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs,
-                                                   ConnectedPairDeps;
-      DenseMap<VPPair, unsigned> PairConnectionTypes;
-      computeConnectedPairs(CandidatePairs, CandidatePairsSet,
-                            PairableInsts, ConnectedPairs, PairConnectionTypes);
-      if (ConnectedPairs.empty()) continue;
-
-      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
-           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
-           I != IE; ++I)
-        for (std::vector<ValuePair>::iterator J = I->second.begin(),
-             JE = I->second.end(); J != JE; ++J)
-          ConnectedPairDeps[*J].push_back(I->first);
-
-      // Build the pairable-instruction dependency map
-      DenseSet<ValuePair> PairableInstUsers;
-      buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
-
-      // There is now a graph of the connected pairs. For each variable, pick
-      // the pairing with the largest dag meeting the depth requirement on at
-      // least one branch. Then select all pairings that are part of that dag
-      // and remove them from the list of available pairings and pairable
-      // variables.
-
-      DenseMap<Value *, Value *> ChosenPairs;
-      choosePairs(CandidatePairs, CandidatePairsSet,
-        CandidatePairCostSavings,
-        PairableInsts, FixedOrderPairs, PairConnectionTypes,
-        ConnectedPairs, ConnectedPairDeps,
-        PairableInstUsers, ChosenPairs);
-
-      if (ChosenPairs.empty()) continue;
-      AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
-                              PairableInsts.end());
-      AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
-
-      // Only for the chosen pairs, propagate information on fixed-order pairs,
-      // pair connections, and their types to the data structures used by the
-      // pair fusion procedures.
-      for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
-           IE = ChosenPairs.end(); I != IE; ++I) {
-        if (FixedOrderPairs.count(*I))
-          AllFixedOrderPairs.insert(*I);
-        else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
-          AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
-
-        for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
-             J != IE; ++J) {
-          DenseMap<VPPair, unsigned>::iterator K =
-            PairConnectionTypes.find(VPPair(*I, *J));
-          if (K != PairConnectionTypes.end()) {
-            AllPairConnectionTypes.insert(*K);
-          } else {
-            K = PairConnectionTypes.find(VPPair(*J, *I));
-            if (K != PairConnectionTypes.end())
-              AllPairConnectionTypes.insert(*K);
-          }
-        }
-      }
-
-      for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator
-           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
-           I != IE; ++I)
-        for (std::vector<ValuePair>::iterator J = I->second.begin(),
-          JE = I->second.end(); J != JE; ++J)
-          if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
-            AllConnectedPairs[I->first].push_back(*J);
-            AllConnectedPairDeps[*J].push_back(I->first);
-          }
-    } while (ShouldContinue);
-
-    if (AllChosenPairs.empty()) return false;
-    NumFusedOps += AllChosenPairs.size();
-
-    // A set of pairs has now been selected. It is now necessary to replace the
-    // paired instructions with vector instructions. For this procedure each
-    // operand must be replaced with a vector operand. This vector is formed
-    // by using build_vector on the old operands. The replaced values are then
-    // replaced with a vector_extract on the result.  Subsequent optimization
-    // passes should coalesce the build/extract combinations.
-
-    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
-                    AllPairConnectionTypes,
-                    AllConnectedPairs, AllConnectedPairDeps);
-
-    // It is important to cleanup here so that future iterations of this
-    // function have less work to do.
-    (void)SimplifyInstructionsInBlock(&BB, TLI);
-    return true;
-  }
-
-  // This function returns true if the provided instruction is capable of being
-  // fused into a vector instruction. This determination is based only on the
-  // type and other attributes of the instruction.
-  bool BBVectorize::isInstVectorizable(Instruction *I,
-                                         bool &IsSimpleLoadStore) {
-    IsSimpleLoadStore = false;
-
-    if (CallInst *C = dyn_cast<CallInst>(I)) {
-      if (!isVectorizableIntrinsic(C))
-        return false;
-    } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
-      // Vectorize simple loads if possbile:
-      IsSimpleLoadStore = L->isSimple();
-      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
-        return false;
-    } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
-      // Vectorize simple stores if possbile:
-      IsSimpleLoadStore = S->isSimple();
-      if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
-        return false;
-    } else if (CastInst *C = dyn_cast<CastInst>(I)) {
-      // We can vectorize casts, but not casts of pointer types, etc.
-      if (!Config.VectorizeCasts)
-        return false;
-
-      Type *SrcTy = C->getSrcTy();
-      if (!SrcTy->isSingleValueType())
-        return false;
-
-      Type *DestTy = C->getDestTy();
-      if (!DestTy->isSingleValueType())
-        return false;
-    } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
-      if (!Config.VectorizeSelect)
-        return false;
-      // We can vectorize a select if either all operands are scalars,
-      // or all operands are vectors. Trying to "widen" a select between
-      // vectors that has a scalar condition results in a malformed select.
-      // FIXME: We could probably be smarter about this by rewriting the select
-      // with different types instead.
-      return (SI->getCondition()->getType()->isVectorTy() ==
-              SI->getTrueValue()->getType()->isVectorTy());
-    } else if (isa<CmpInst>(I)) {
-      if (!Config.VectorizeCmp)
-        return false;
-    } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) {
-      if (!Config.VectorizeGEP)
-        return false;
-
-      // Currently, vector GEPs exist only with one index.
-      if (G->getNumIndices() != 1)
-        return false;
-    } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
-        isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
-      return false;
-    }
-
-    Type *T1, *T2;
-    getInstructionTypes(I, T1, T2);
-
-    // Not every type can be vectorized...
-    if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
-        !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
-      return false;
-
-    if (T1->getScalarSizeInBits() == 1) {
-      if (!Config.VectorizeBools)
-        return false;
-    } else {
-      if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
-        return false;
-    }
-
-    if (T2->getScalarSizeInBits() == 1) {
-      if (!Config.VectorizeBools)
-        return false;
-    } else {
-      if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
-        return false;
-    }
-
-    if (!Config.VectorizeFloats
-        && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
-      return false;
-
-    // Don't vectorize target-specific types.
-    if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
-      return false;
-    if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
-      return false;
-
-    if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
-                                      T2->getScalarType()->isPointerTy()))
-      return false;
-
-    if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
-                 T2->getPrimitiveSizeInBits() >= Config.VectorBits))
-      return false;
-
-    return true;
-  }
-
-  // This function returns true if the two provided instructions are compatible
-  // (meaning that they can be fused into a vector instruction). This assumes
-  // that I has already been determined to be vectorizable and that J is not
-  // in the use dag of I.
-  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
-                       bool IsSimpleLoadStore, bool NonPow2Len,
-                       int &CostSavings, int &FixedOrder) {
-    DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
-                     " <-> " << *J << "\n");
-
-    CostSavings = 0;
-    FixedOrder = 0;
-
-    // Loads and stores can be merged if they have different alignments,
-    // but are otherwise the same.
-    if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
-                      (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
-      return false;
-
-    Type *IT1, *IT2, *JT1, *JT2;
-    getInstructionTypes(I, IT1, IT2);
-    getInstructionTypes(J, JT1, JT2);
-    unsigned MaxTypeBits = std::max(
-      IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
-      IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
-    if (!TTI && MaxTypeBits > Config.VectorBits)
-      return false;
-
-    // FIXME: handle addsub-type operations!
-
-    if (IsSimpleLoadStore) {
-      Value *IPtr, *JPtr;
-      unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
-      int64_t OffsetInElmts = 0;
-      if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
-                         IAddressSpace, JAddressSpace, OffsetInElmts) &&
-          std::abs(OffsetInElmts) == 1) {
-        FixedOrder = (int) OffsetInElmts;
-        unsigned BottomAlignment = IAlignment;
-        if (OffsetInElmts < 0) BottomAlignment = JAlignment;
-
-        Type *aTypeI = isa<StoreInst>(I) ?
-          cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
-        Type *aTypeJ = isa<StoreInst>(J) ?
-          cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
-        Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
-
-        if (Config.AlignedOnly) {
-          // An aligned load or store is possible only if the instruction
-          // with the lower offset has an alignment suitable for the
-          // vector type.
-          const DataLayout &DL = I->getModule()->getDataLayout();
-          unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
-          if (BottomAlignment < VecAlignment)
-            return false;
-        }
-
-        if (TTI) {
-          unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
-                                                IAlignment, IAddressSpace);
-          unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
-                                                JAlignment, JAddressSpace);
-          unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
-                                                BottomAlignment,
-                                                IAddressSpace);
-
-          ICost += TTI->getAddressComputationCost(aTypeI);
-          JCost += TTI->getAddressComputationCost(aTypeJ);
-          VCost += TTI->getAddressComputationCost(VType);
-
-          if (VCost > ICost + JCost)
-            return false;
-
-          // We don't want to fuse to a type that will be split, even
-          // if the two input types will also be split and there is no other
-          // associated cost.
-          unsigned VParts = TTI->getNumberOfParts(VType);
-          if (VParts > 1)
-            return false;
-          else if (!VParts && VCost == ICost + JCost)
-            return false;
-
-          CostSavings = ICost + JCost - VCost;
-        }
-      } else {
-        return false;
-      }
-    } else if (TTI) {
-      TargetTransformInfo::OperandValueKind Op1VK =
-          TargetTransformInfo::OK_AnyValue;
-      TargetTransformInfo::OperandValueKind Op2VK =
-          TargetTransformInfo::OK_AnyValue;
-      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
-      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
-      Type *VT1 = getVecTypeForPair(IT1, JT1),
-           *VT2 = getVecTypeForPair(IT2, JT2);
-
-      // On some targets (example X86) the cost of a vector shift may vary
-      // depending on whether the second operand is a Uniform or
-      // NonUniform Constant.
-      switch (I->getOpcode()) {
-      default : break;
-      case Instruction::Shl:
-      case Instruction::LShr:
-      case Instruction::AShr:
-
-        // If both I and J are scalar shifts by constant, then the
-        // merged vector shift count would be either a constant splat value
-        // or a non-uniform vector of constants.
-        if (ConstantInt *CII = dyn_cast<ConstantInt>(I->getOperand(1))) {
-          if (ConstantInt *CIJ = dyn_cast<ConstantInt>(J->getOperand(1)))
-            Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
-                               TargetTransformInfo::OK_NonUniformConstantValue;
-        } else {
-          // Check for a splat of a constant or for a non uniform vector
-          // of constants.
-          Value *IOp = I->getOperand(1);
-          Value *JOp = J->getOperand(1);
-          if ((isa<ConstantVector>(IOp) || isa<ConstantDataVector>(IOp)) &&
-              (isa<ConstantVector>(JOp) || isa<ConstantDataVector>(JOp))) {
-            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
-            Constant *SplatValue = cast<Constant>(IOp)->getSplatValue();
-            if (SplatValue != nullptr &&
-                SplatValue == cast<Constant>(JOp)->getSplatValue())
-              Op2VK = TargetTransformInfo::OK_UniformConstantValue;
-          }
-        }
-      }
-
-      // Note that this procedure is incorrect for insert and extract element
-      // instructions (because combining these often results in a shuffle),
-      // but this cost is ignored (because insert and extract element
-      // instructions are assigned a zero depth factor and are not really
-      // fused in general).
-      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
-
-      if (VCost > ICost + JCost)
-        return false;
-
-      // We don't want to fuse to a type that will be split, even
-      // if the two input types will also be split and there is no other
-      // associated cost.
-      unsigned VParts1 = TTI->getNumberOfParts(VT1),
-               VParts2 = TTI->getNumberOfParts(VT2);
-      if (VParts1 > 1 || VParts2 > 1)
-        return false;
-      else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
-        return false;
-
-      CostSavings = ICost + JCost - VCost;
-    }
-
-    // The powi,ctlz,cttz intrinsics are special because only the first
-    // argument is vectorized, the second arguments must be equal.
-    CallInst *CI = dyn_cast<CallInst>(I);
-    Function *FI;
-    if (CI && (FI = CI->getCalledFunction())) {
-      Intrinsic::ID IID = FI->getIntrinsicID();
-      if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
-          IID == Intrinsic::cttz) {
-        Value *A1I = CI->getArgOperand(1),
-              *A1J = cast<CallInst>(J)->getArgOperand(1);
-        const SCEV *A1ISCEV = SE->getSCEV(A1I),
-                   *A1JSCEV = SE->getSCEV(A1J);
-        return (A1ISCEV == A1JSCEV);
-      }
-
-      if (IID && TTI) {
-        FastMathFlags FMFCI;
-        if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
-          FMFCI = FPMOCI->getFastMathFlags();
-        SmallVector<Value *, 4> IArgs(CI->arg_operands());
-        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
-
-        CallInst *CJ = cast<CallInst>(J);
-
-        FastMathFlags FMFCJ;
-        if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
-          FMFCJ = FPMOCJ->getFastMathFlags();
-
-        SmallVector<Value *, 4> JArgs(CJ->arg_operands());
-        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
-
-        assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
-               "Intrinsic argument counts differ");
-        SmallVector<Type*, 4> Tys;
-        SmallVector<Value *, 4> VecArgs;
-        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-          if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
-               IID == Intrinsic::cttz) && i == 1) {
-            Tys.push_back(CI->getArgOperand(i)->getType());
-            VecArgs.push_back(CI->getArgOperand(i));
-          }
-          else {
-            Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
-                                            CJ->getArgOperand(i)->getType()));
-            // Add both operands, and then count their scalarization overhead
-            // with VF 1.
-            VecArgs.push_back(CI->getArgOperand(i));
-            VecArgs.push_back(CJ->getArgOperand(i));
-          }
-        }
-
-        // Compute the scalarization cost here with the original operands (to
-        // check for uniqueness etc), and then call getIntrinsicInstrCost()
-        // with the constructed vector types.
-        Type *RetTy = getVecTypeForPair(IT1, JT1);
-        unsigned ScalarizationCost = 0;
-        if (!RetTy->isVoidTy())
-          ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
-        ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
-
-        FastMathFlags FMFV = FMFCI;
-        FMFV &= FMFCJ;
-        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
-                                                    ScalarizationCost);
-
-        if (VCost > ICost + JCost)
-          return false;
-
-        // We don't want to fuse to a type that will be split, even
-        // if the two input types will also be split and there is no other
-        // associated cost.
-        unsigned RetParts = TTI->getNumberOfParts(RetTy);
-        if (RetParts > 1)
-          return false;
-        else if (!RetParts && VCost == ICost + JCost)
-          return false;
-
-        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-          if (!Tys[i]->isVectorTy())
-            continue;
-
-          unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
-          if (NumParts > 1)
-            return false;
-          else if (!NumParts && VCost == ICost + JCost)
-            return false;
-        }
-
-        CostSavings = ICost + JCost - VCost;
-      }
-    }
-
-    return true;
-  }
-
-  // Figure out whether or not J uses I and update the users and write-set
-  // structures associated with I. Specifically, Users represents the set of
-  // instructions that depend on I. WriteSet represents the set
-  // of memory locations that are dependent on I. If UpdateUsers is true,
-  // and J uses I, then Users is updated to contain J and WriteSet is updated
-  // to contain any memory locations to which J writes. The function returns
-  // true if J uses I. By default, alias analysis is used to determine
-  // whether J reads from memory that overlaps with a location in WriteSet.
-  // If LoadMoveSet is not null, then it is a previously-computed map
-  // where the key is the memory-based user instruction and the value is
-  // the instruction to be compared with I. So, if LoadMoveSet is provided,
-  // then the alias analysis is not used. This is necessary because this
-  // function is called during the process of moving instructions during
-  // vectorization and the results of the alias analysis are not stable during
-  // that process.
-  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
-                       AliasSetTracker &WriteSet, Instruction *I,
-                       Instruction *J, bool UpdateUsers,
-                       DenseSet<ValuePair> *LoadMoveSetPairs) {
-    bool UsesI = false;
-
-    // This instruction may already be marked as a user due, for example, to
-    // being a member of a selected pair.
-    if (Users.count(J))
-      UsesI = true;
-
-    if (!UsesI)
-      for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
-           JU != JE; ++JU) {
-        Value *V = *JU;
-        if (I == V || Users.count(V)) {
-          UsesI = true;
-          break;
-        }
-      }
-    if (!UsesI && J->mayReadFromMemory()) {
-      if (LoadMoveSetPairs) {
-        UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
-      } else {
-        for (AliasSetTracker::iterator W = WriteSet.begin(),
-             WE = WriteSet.end(); W != WE; ++W) {
-          if (W->aliasesUnknownInst(J, *AA)) {
-            UsesI = true;
-            break;
-          }
-        }
-      }
-    }
-
-    if (UsesI && UpdateUsers) {
-      if (J->mayWriteToMemory()) WriteSet.add(J);
-      Users.insert(J);
-    }
-
-    return UsesI;
-  }
-
-  // This function iterates over all instruction pairs in the provided
-  // basic block and collects all candidate pairs for vectorization.
-  bool BBVectorize::getCandidatePairs(BasicBlock &BB,
-                       BasicBlock::iterator &Start,
-                       DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-                       DenseSet<ValuePair> &FixedOrderPairs,
-                       DenseMap<ValuePair, int> &CandidatePairCostSavings,
-                       std::vector<Value *> &PairableInsts, bool NonPow2Len) {
-    size_t TotalPairs = 0;
-    BasicBlock::iterator E = BB.end();
-    if (Start == E) return false;
-
-    bool ShouldContinue = false, IAfterStart = false;
-    for (BasicBlock::iterator I = Start++; I != E; ++I) {
-      if (I == Start) IAfterStart = true;
-
-      bool IsSimpleLoadStore;
-      if (!isInstVectorizable(&*I, IsSimpleLoadStore))
-        continue;
-
-      // Look for an instruction with which to pair instruction *I...
-      DenseSet<Value *> Users;
-      AliasSetTracker WriteSet(*AA);
-      if (I->mayWriteToMemory())
-        WriteSet.add(&*I);
-
-      bool JAfterStart = IAfterStart;
-      BasicBlock::iterator J = std::next(I);
-      for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
-        if (J == Start)
-          JAfterStart = true;
-
-        // Determine if J uses I, if so, exit the loop.
-        bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
-        if (Config.FastDep) {
-          // Note: For this heuristic to be effective, independent operations
-          // must tend to be intermixed. This is likely to be true from some
-          // kinds of grouped loop unrolling (but not the generic LLVM pass),
-          // but otherwise may require some kind of reordering pass.
-
-          // When using fast dependency analysis,
-          // stop searching after first use:
-          if (UsesI) break;
-        } else {
-          if (UsesI) continue;
-        }
-
-        // J does not use I, and comes before the first use of I, so it can be
-        // merged with I if the instructions are compatible.
-        int CostSavings, FixedOrder;
-        if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
-                                CostSavings, FixedOrder))
-          continue;
-
-        // J is a candidate for merging with I.
-        if (PairableInsts.empty() ||
-            PairableInsts[PairableInsts.size() - 1] != &*I) {
-          PairableInsts.push_back(&*I);
-        }
-
-        CandidatePairs[&*I].push_back(&*J);
-        ++TotalPairs;
-        if (TTI)
-          CandidatePairCostSavings.insert(
-              ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
-
-        if (FixedOrder == 1)
-          FixedOrderPairs.insert(ValuePair(&*I, &*J));
-        else if (FixedOrder == -1)
-          FixedOrderPairs.insert(ValuePair(&*J, &*I));
-
-        // The next call to this function must start after the last instruction
-        // selected during this invocation.
-        if (JAfterStart) {
-          Start = std::next(J);
-          IAfterStart = JAfterStart = false;
-        }
-
-        DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
-                     << *I << " <-> " << *J << " (cost savings: " <<
-                     CostSavings << ")\n");
-
-        // If we have already found too many pairs, break here and this function
-        // will be called again starting after the last instruction selected
-        // during this invocation.
-        if (PairableInsts.size() >= Config.MaxInsts ||
-            TotalPairs >= Config.MaxPairs) {
-          ShouldContinue = true;
-          break;
-        }
-      }
-
-      if (ShouldContinue)
-        break;
-    }
-
-    DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
-           << " instructions with candidate pairs\n");
-
-    return ShouldContinue;
-  }
-
-  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
-  // it looks for pairs such that both members have an input which is an
-  // output of PI or PJ.
-  void BBVectorize::computePairsConnectedTo(
-                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-                  DenseSet<ValuePair> &CandidatePairsSet,
-                  std::vector<Value *> &PairableInsts,
-                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-                  DenseMap<VPPair, unsigned> &PairConnectionTypes,
-                  ValuePair P) {
-    StoreInst *SI, *SJ;
-
-    // For each possible pairing for this variable, look at the uses of
-    // the first value...
-    for (Value::user_iterator I = P.first->user_begin(),
-                              E = P.first->user_end();
-         I != E; ++I) {
-      User *UI = *I;
-      if (isa<LoadInst>(UI)) {
-        // A pair cannot be connected to a load because the load only takes one
-        // operand (the address) and it is a scalar even after vectorization.
-        continue;
-      } else if ((SI = dyn_cast<StoreInst>(UI)) &&
-                 P.first == SI->getPointerOperand()) {
-        // Similarly, a pair cannot be connected to a store through its
-        // pointer operand.
-        continue;
-      }
-
-      // For each use of the first variable, look for uses of the second
-      // variable...
-      for (User *UJ : P.second->users()) {
-        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
-            P.second == SJ->getPointerOperand())
-          continue;
-
-        // Look for <I, J>:
-        if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
-          VPPair VP(P, ValuePair(UI, UJ));
-          ConnectedPairs[VP.first].push_back(VP.second);
-          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
-        }
-
-        // Look for <J, I>:
-        if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
-          VPPair VP(P, ValuePair(UJ, UI));
-          ConnectedPairs[VP.first].push_back(VP.second);
-          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
-        }
-      }
-
-      if (Config.SplatBreaksChain) continue;
-      // Look for cases where just the first value in the pair is used by
-      // both members of another pair (splatting).
-      for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
-        User *UJ = *J;
-        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
-            P.first == SJ->getPointerOperand())
-          continue;
-
-        if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
-          VPPair VP(P, ValuePair(UI, UJ));
-          ConnectedPairs[VP.first].push_back(VP.second);
-          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
-        }
-      }
-    }
-
-    if (Config.SplatBreaksChain) return;
-    // Look for cases where just the second value in the pair is used by
-    // both members of another pair (splatting).
-    for (Value::user_iterator I = P.second->user_begin(),
-                              E = P.second->user_end();
-         I != E; ++I) {
-      User *UI = *I;
-      if (isa<LoadInst>(UI))
-        continue;
-      else if ((SI = dyn_cast<StoreInst>(UI)) &&
-               P.second == SI->getPointerOperand())
-        continue;
-
-      for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
-        User *UJ = *J;
-        if ((SJ = dyn_cast<StoreInst>(UJ)) &&
-            P.second == SJ->getPointerOperand())
-          continue;
-
-        if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
-          VPPair VP(P, ValuePair(UI, UJ));
-          ConnectedPairs[VP.first].push_back(VP.second);
-          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
-        }
-      }
-    }
-  }
-
-  // This function figures out which pairs are connected.  Two pairs are
-  // connected if some output of the first pair forms an input to both members
-  // of the second pair.
-  void BBVectorize::computeConnectedPairs(
-                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-                  DenseSet<ValuePair> &CandidatePairsSet,
-                  std::vector<Value *> &PairableInsts,
-                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-                  DenseMap<VPPair, unsigned> &PairConnectionTypes) {
-    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
-         PE = PairableInsts.end(); PI != PE; ++PI) {
-      DenseMap<Value *, std::vector<Value *> >::iterator PP =
-        CandidatePairs.find(*PI);
-      if (PP == CandidatePairs.end())
-        continue;
-
-      for (std::vector<Value *>::iterator P = PP->second.begin(),
-           E = PP->second.end(); P != E; ++P)
-        computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
-                                PairableInsts, ConnectedPairs,
-                                PairConnectionTypes, ValuePair(*PI, *P));
-    }
-
-    DEBUG(size_t TotalPairs = 0;
-          for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I =
-               ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
-            TotalPairs += I->second.size();
-          dbgs() << "BBV: found " << TotalPairs
-                 << " pair connections.\n");
-  }
-
-  // This function builds a set of use tuples such that <A, B> is in the set
-  // if B is in the use dag of A. If B is in the use dag of A, then B
-  // depends on the output of A.
-  void BBVectorize::buildDepMap(
-                      BasicBlock &BB,
-                      DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-                      std::vector<Value *> &PairableInsts,
-                      DenseSet<ValuePair> &PairableInstUsers) {
-    DenseSet<Value *> IsInPair;
-    for (DenseMap<Value *, std::vector<Value *> >::iterator C =
-         CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
-      IsInPair.insert(C->first);
-      IsInPair.insert(C->second.begin(), C->second.end());
-    }
-
-    // Iterate through the basic block, recording all users of each
-    // pairable instruction.
-
-    BasicBlock::iterator E = BB.end(), EL =
-      BasicBlock::iterator(cast<Instruction>(PairableInsts.back()));
-    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
-      if (IsInPair.find(&*I) == IsInPair.end())
-        continue;
-
-      DenseSet<Value *> Users;
-      AliasSetTracker WriteSet(*AA);
-      if (I->mayWriteToMemory())
-        WriteSet.add(&*I);
-
-      for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
-        (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
-
-        if (J == EL)
-          break;
-      }
-
-      for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
-           U != E; ++U) {
-        if (IsInPair.find(*U) == IsInPair.end()) continue;
-        PairableInstUsers.insert(ValuePair(&*I, *U));
-      }
-
-      if (I == EL)
-        break;
-    }
-  }
-
-  // Returns true if an input to pair P is an output of pair Q and also an
-  // input of pair Q is an output of pair P. If this is the case, then these
-  // two pairs cannot be simultaneously fused.
-  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
-             DenseSet<ValuePair> &PairableInstUsers,
-             DenseMap<ValuePair, std::vector<ValuePair> > *PairableInstUserMap,
-             DenseSet<VPPair> *PairableInstUserPairSet) {
-    // Two pairs are in conflict if they are mutual Users of eachother.
-    bool QUsesP = PairableInstUsers.count(ValuePair(P.first,  Q.first))  ||
-                  PairableInstUsers.count(ValuePair(P.first,  Q.second)) ||
-                  PairableInstUsers.count(ValuePair(P.second, Q.first))  ||
-                  PairableInstUsers.count(ValuePair(P.second, Q.second));
-    bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first,  P.first))  ||
-                  PairableInstUsers.count(ValuePair(Q.first,  P.second)) ||
-                  PairableInstUsers.count(ValuePair(Q.second, P.first))  ||
-                  PairableInstUsers.count(ValuePair(Q.second, P.second));
-    if (PairableInstUserMap) {
-      // FIXME: The expensive part of the cycle check is not so much the cycle
-      // check itself but this edge insertion procedure. This needs some
-      // profiling and probably a different data structure.
-      if (PUsesQ) {
-        if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
-          (*PairableInstUserMap)[Q].push_back(P);
-      }
-      if (QUsesP) {
-        if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
-          (*PairableInstUserMap)[P].push_back(Q);
-      }
-    }
-
-    return (QUsesP && PUsesQ);
-  }
-
-  // This function walks the use graph of current pairs to see if, starting
-  // from P, the walk returns to P.
-  bool BBVectorize::pairWillFormCycle(ValuePair P,
-             DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
-             DenseSet<ValuePair> &CurrentPairs) {
-    DEBUG(if (DebugCycleCheck)
-            dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
-                   << *P.second << "\n");
-    // A lookup table of visisted pairs is kept because the PairableInstUserMap
-    // contains non-direct associations.
-    DenseSet<ValuePair> Visited;
-    SmallVector<ValuePair, 32> Q;
-    // General depth-first post-order traversal:
-    Q.push_back(P);
-    do {
-      ValuePair QTop = Q.pop_back_val();
-      Visited.insert(QTop);
-
-      DEBUG(if (DebugCycleCheck)
-              dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
-                     << *QTop.second << "\n");
-      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
-        PairableInstUserMap.find(QTop);
-      if (QQ == PairableInstUserMap.end())
-        continue;
-
-      for (std::vector<ValuePair>::iterator C = QQ->second.begin(),
-           CE = QQ->second.end(); C != CE; ++C) {
-        if (*C == P) {
-          DEBUG(dbgs()
-                 << "BBV: rejected to prevent non-trivial cycle formation: "
-                 << QTop.first << " <-> " << C->second << "\n");
-          return true;
-        }
-
-        if (CurrentPairs.count(*C) && !Visited.count(*C))
-          Q.push_back(*C);
-      }
-    } while (!Q.empty());
-
-    return false;
-  }
-
-  // This function builds the initial dag of connected pairs with the
-  // pair J at the root.
-  void BBVectorize::buildInitialDAGFor(
-                  DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-                  DenseSet<ValuePair> &CandidatePairsSet,
-                  std::vector<Value *> &PairableInsts,
-                  DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-                  DenseSet<ValuePair> &PairableInstUsers,
-                  DenseMap<Value *, Value *> &ChosenPairs,
-                  DenseMap<ValuePair, size_t> &DAG, ValuePair J) {
-    // Each of these pairs is viewed as the root node of a DAG. The DAG
-    // is then walked (depth-first). As this happens, we keep track of
-    // the pairs that compose the DAG and the maximum depth of the DAG.
-    SmallVector<ValuePairWithDepth, 32> Q;
-    // General depth-first post-order traversal:
-    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
-    do {
-      ValuePairWithDepth QTop = Q.back();
-
-      // Push each child onto the queue:
-      bool MoreChildren = false;
-      size_t MaxChildDepth = QTop.second;
-      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
-        ConnectedPairs.find(QTop.first);
-      if (QQ != ConnectedPairs.end())
-        for (std::vector<ValuePair>::iterator k = QQ->second.begin(),
-             ke = QQ->second.end(); k != ke; ++k) {
-          // Make sure that this child pair is still a candidate:
-          if (CandidatePairsSet.count(*k)) {
-            DenseMap<ValuePair, size_t>::iterator C = DAG.find(*k);
-            if (C == DAG.end()) {
-              size_t d = getDepthFactor(k->first);
-              Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
-              MoreChildren = true;
-            } else {
-              MaxChildDepth = std::max(MaxChildDepth, C->second);
-            }
-          }
-        }
-
-      if (!MoreChildren) {
-        // Record the current pair as part of the DAG:
-        DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
-        Q.pop_back();
-      }
-    } while (!Q.empty());
-  }
-
-  // Given some initial dag, prune it by removing conflicting pairs (pairs
-  // that cannot be simultaneously chosen for vectorization).
-  void BBVectorize::pruneDAGFor(
-              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-              std::vector<Value *> &PairableInsts,
-              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-              DenseSet<ValuePair> &PairableInstUsers,
-              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
-              DenseSet<VPPair> &PairableInstUserPairSet,
-              DenseMap<Value *, Value *> &ChosenPairs,
-              DenseMap<ValuePair, size_t> &DAG,
-              DenseSet<ValuePair> &PrunedDAG, ValuePair J,
-              bool UseCycleCheck) {
-    SmallVector<ValuePairWithDepth, 32> Q;
-    // General depth-first post-order traversal:
-    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
-    do {
-      ValuePairWithDepth QTop = Q.pop_back_val();
-      PrunedDAG.insert(QTop.first);
-
-      // Visit each child, pruning as necessary...
-      SmallVector<ValuePairWithDepth, 8> BestChildren;
-      DenseMap<ValuePair, std::vector<ValuePair> >::iterator QQ =
-        ConnectedPairs.find(QTop.first);
-      if (QQ == ConnectedPairs.end())
-        continue;
-
-      for (std::vector<ValuePair>::iterator K = QQ->second.begin(),
-           KE = QQ->second.end(); K != KE; ++K) {
-        DenseMap<ValuePair, size_t>::iterator C = DAG.find(*K);
-        if (C == DAG.end()) continue;
-
-        // This child is in the DAG, now we need to make sure it is the
-        // best of any conflicting children. There could be multiple
-        // conflicting children, so first, determine if we're keeping
-        // this child, then delete conflicting children as necessary.
-
-        // It is also necessary to guard against pairing-induced
-        // dependencies. Consider instructions a .. x .. y .. b
-        // such that (a,b) are to be fused and (x,y) are to be fused
-        // but a is an input to x and b is an output from y. This
-        // means that y cannot be moved after b but x must be moved
-        // after b for (a,b) to be fused. In other words, after
-        // fusing (a,b) we have y .. a/b .. x where y is an input
-        // to a/b and x is an output to a/b: x and y can no longer
-        // be legally fused. To prevent this condition, we must
-        // make sure that a child pair added to the DAG is not
-        // both an input and output of an already-selected pair.
-
-        // Pairing-induced dependencies can also form from more complicated
-        // cycles. The pair vs. pair conflicts are easy to check, and so
-        // that is done explicitly for "fast rejection", and because for
-        // child vs. child conflicts, we may prefer to keep the current
-        // pair in preference to the already-selected child.
-        DenseSet<ValuePair> CurrentPairs;
-
-        bool CanAdd = true;
-        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
-              = BestChildren.begin(), E2 = BestChildren.end();
-             C2 != E2; ++C2) {
-          if (C2->first.first == C->first.first ||
-              C2->first.first == C->first.second ||
-              C2->first.second == C->first.first ||
-              C2->first.second == C->first.second ||
-              pairsConflict(C2->first, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : nullptr,
-                            UseCycleCheck ? &PairableInstUserPairSet
-                                          : nullptr)) {
-            if (C2->second >= C->second) {
-              CanAdd = false;
-              break;
-            }
-
-            CurrentPairs.insert(C2->first);
-          }
-        }
-        if (!CanAdd) continue;
-
-        // Even worse, this child could conflict with another node already
-        // selected for the DAG. If that is the case, ignore this child.
-        for (DenseSet<ValuePair>::iterator T = PrunedDAG.begin(),
-             E2 = PrunedDAG.end(); T != E2; ++T) {
-          if (T->first == C->first.first ||
-              T->first == C->first.second ||
-              T->second == C->first.first ||
-              T->second == C->first.second ||
-              pairsConflict(*T, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : nullptr,
-                            UseCycleCheck ? &PairableInstUserPairSet
-                                          : nullptr)) {
-            CanAdd = false;
-            break;
-          }
-
-          CurrentPairs.insert(*T);
-        }
-        if (!CanAdd) continue;
-
-        // And check the queue too...
-        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
-             E2 = Q.end(); C2 != E2; ++C2) {
-          if (C2->first.first == C->first.first ||
-              C2->first.first == C->first.second ||
-              C2->first.second == C->first.first ||
-              C2->first.second == C->first.second ||
-              pairsConflict(C2->first, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : nullptr,
-                            UseCycleCheck ? &PairableInstUserPairSet
-                                          : nullptr)) {
-            CanAdd = false;
-            break;
-          }
-
-          CurrentPairs.insert(C2->first);
-        }
-        if (!CanAdd) continue;
-
-        // Last but not least, check for a conflict with any of the
-        // already-chosen pairs.
-        for (DenseMap<Value *, Value *>::iterator C2 =
-              ChosenPairs.begin(), E2 = ChosenPairs.end();
-             C2 != E2; ++C2) {
-          if (pairsConflict(*C2, C->first, PairableInstUsers,
-                            UseCycleCheck ? &PairableInstUserMap : nullptr,
-                            UseCycleCheck ? &PairableInstUserPairSet
-                                          : nullptr)) {
-            CanAdd = false;
-            break;
-          }
-
-          CurrentPairs.insert(*C2);
-        }
-        if (!CanAdd) continue;
-
-        // To check for non-trivial cycles formed by the addition of the
-        // current pair we've formed a list of all relevant pairs, now use a
-        // graph walk to check for a cycle. We start from the current pair and
-        // walk the use dag to see if we again reach the current pair. If we
-        // do, then the current pair is rejected.
-
-        // FIXME: It may be more efficient to use a topological-ordering
-        // algorithm to improve the cycle check. This should be investigated.
-        if (UseCycleCheck &&
-            pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
-          continue;
-
-        // This child can be added, but we may have chosen it in preference
-        // to an already-selected child. Check for this here, and if a
-        // conflict is found, then remove the previously-selected child
-        // before adding this one in its place.
-        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
-              = BestChildren.begin(); C2 != BestChildren.end();) {
-          if (C2->first.first == C->first.first ||
-              C2->first.first == C->first.second ||
-              C2->first.second == C->first.first ||
-              C2->first.second == C->first.second ||
-              pairsConflict(C2->first, C->first, PairableInstUsers))
-            C2 = BestChildren.erase(C2);
-          else
-            ++C2;
-        }
-
-        BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
-      }
-
-      for (SmallVectorImpl<ValuePairWithDepth>::iterator C
-            = BestChildren.begin(), E2 = BestChildren.end();
-           C != E2; ++C) {
-        size_t DepthF = getDepthFactor(C->first.first);
-        Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
-      }
-    } while (!Q.empty());
-  }
-
-  // This function finds the best dag of mututally-compatible connected
-  // pairs, given the choice of root pairs as an iterator range.
-  void BBVectorize::findBestDAGFor(
-              DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-              DenseSet<ValuePair> &CandidatePairsSet,
-              DenseMap<ValuePair, int> &CandidatePairCostSavings,
-              std::vector<Value *> &PairableInsts,
-              DenseSet<ValuePair> &FixedOrderPairs,
-              DenseMap<VPPair, unsigned> &PairConnectionTypes,
-              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-              DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
-              DenseSet<ValuePair> &PairableInstUsers,
-              DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap,
-              DenseSet<VPPair> &PairableInstUserPairSet,
-              DenseMap<Value *, Value *> &ChosenPairs,
-              DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth,
-              int &BestEffSize, Value *II, std::vector<Value *>&JJ,
-              bool UseCycleCheck) {
-    for (std::vector<Value *>::iterator J = JJ.begin(), JE = JJ.end();
-         J != JE; ++J) {
-      ValuePair IJ(II, *J);
-      if (!CandidatePairsSet.count(IJ))
-        continue;
-
-      // Before going any further, make sure that this pair does not
-      // conflict with any already-selected pairs (see comment below
-      // near the DAG pruning for more details).
-      DenseSet<ValuePair> ChosenPairSet;
-      bool DoesConflict = false;
-      for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
-           E = ChosenPairs.end(); C != E; ++C) {
-        if (pairsConflict(*C, IJ, PairableInstUsers,
-                          UseCycleCheck ? &PairableInstUserMap : nullptr,
-                          UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
-          DoesConflict = true;
-          break;
-        }
-
-        ChosenPairSet.insert(*C);
-      }
-      if (DoesConflict) continue;
-
-      if (UseCycleCheck &&
-          pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
-        continue;
-
-      DenseMap<ValuePair, size_t> DAG;
-      buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
-                          PairableInsts, ConnectedPairs,
-                          PairableInstUsers, ChosenPairs, DAG, IJ);
-
-      // Because we'll keep the child with the largest depth, the largest
-      // depth is still the same in the unpruned DAG.
-      size_t MaxDepth = DAG.lookup(IJ);
-
-      DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
-                   << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
-                   MaxDepth << " and size " << DAG.size() << "\n");
-
-      // At this point the DAG has been constructed, but, may contain
-      // contradictory children (meaning that different children of
-      // some dag node may be attempting to fuse the same instruction).
-      // So now we walk the dag again, in the case of a conflict,
-      // keep only the child with the largest depth. To break a tie,
-      // favor the first child.
-
-      DenseSet<ValuePair> PrunedDAG;
-      pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
-                   PairableInstUsers, PairableInstUserMap,
-                   PairableInstUserPairSet,
-                   ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
-
-      int EffSize = 0;
-      if (TTI) {
-        DenseSet<Value *> PrunedDAGInstrs;
-        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
-             E = PrunedDAG.end(); S != E; ++S) {
-          PrunedDAGInstrs.insert(S->first);
-          PrunedDAGInstrs.insert(S->second);
-        }
-
-        // The set of pairs that have already contributed to the total cost.
-        DenseSet<ValuePair> IncomingPairs;
-
-        // If the cost model were perfect, this might not be necessary; but we
-        // need to make sure that we don't get stuck vectorizing our own
-        // shuffle chains.
-        bool HasNontrivialInsts = false;
-
-        // The node weights represent the cost savings associated with
-        // fusing the pair of instructions.
-        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
-             E = PrunedDAG.end(); S != E; ++S) {
-          if (!isa<ShuffleVectorInst>(S->first) &&
-              !isa<InsertElementInst>(S->first) &&
-              !isa<ExtractElementInst>(S->first))
-            HasNontrivialInsts = true;
-
-          bool FlipOrder = false;
-
-          if (getDepthFactor(S->first)) {
-            int ESContrib = CandidatePairCostSavings.find(*S)->second;
-            DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
-                   << *S->first << " <-> " << *S->second << "} = " <<
-                   ESContrib << "\n");
-            EffSize += ESContrib;
-          }
-
-          // The edge weights contribute in a negative sense: they represent
-          // the cost of shuffles.
-          DenseMap<ValuePair, std::vector<ValuePair> >::iterator SS =
-            ConnectedPairDeps.find(*S);
-          if (SS != ConnectedPairDeps.end()) {
-            unsigned NumDepsDirect = 0, NumDepsSwap = 0;
-            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
-                 TE = SS->second.end(); T != TE; ++T) {
-              VPPair Q(*S, *T);
-              if (!PrunedDAG.count(Q.second))
-                continue;
-              DenseMap<VPPair, unsigned>::iterator R =
-                PairConnectionTypes.find(VPPair(Q.second, Q.first));
-              assert(R != PairConnectionTypes.end() &&
-                     "Cannot find pair connection type");
-              if (R->second == PairConnectionDirect)
-                ++NumDepsDirect;
-              else if (R->second == PairConnectionSwap)
-                ++NumDepsSwap;
-            }
-
-            // If there are more swaps than direct connections, then
-            // the pair order will be flipped during fusion. So the real
-            // number of swaps is the minimum number.
-            FlipOrder = !FixedOrderPairs.count(*S) &&
-              ((NumDepsSwap > NumDepsDirect) ||
-                FixedOrderPairs.count(ValuePair(S->second, S->first)));
-
-            for (std::vector<ValuePair>::iterator T = SS->second.begin(),
-                 TE = SS->second.end(); T != TE; ++T) {
-              VPPair Q(*S, *T);
-              if (!PrunedDAG.count(Q.second))
-                continue;
-              DenseMap<VPPair, unsigned>::iterator R =
-                PairConnectionTypes.find(VPPair(Q.second, Q.first));
-              assert(R != PairConnectionTypes.end() &&
-                     "Cannot find pair connection type");
-              Type *Ty1 = Q.second.first->getType(),
-                   *Ty2 = Q.second.second->getType();
-              Type *VTy = getVecTypeForPair(Ty1, Ty2);
-              if ((R->second == PairConnectionDirect && FlipOrder) ||
-                  (R->second == PairConnectionSwap && !FlipOrder)  ||
-                  R->second == PairConnectionSplat) {
-                int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
-                                                   VTy, VTy);
-
-                if (VTy->getVectorNumElements() == 2) {
-                  if (R->second == PairConnectionSplat)
-                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
-                      TargetTransformInfo::SK_Broadcast, VTy));
-                  else
-                    ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
-                      TargetTransformInfo::SK_Reverse, VTy));
-                }
-
-                DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
-                  *Q.second.first << " <-> " << *Q.second.second <<
-                    "} -> {" <<
-                  *S->first << " <-> " << *S->second << "} = " <<
-                   ESContrib << "\n");
-                EffSize -= ESContrib;
-              }
-            }
-          }
-
-          // Compute the cost of outgoing edges. We assume that edges outgoing
-          // to shuffles, inserts or extracts can be merged, and so contribute
-          // no additional cost.
-          if (!S->first->getType()->isVoidTy()) {
-            Type *Ty1 = S->first->getType(),
-                 *Ty2 = S->second->getType();
-            Type *VTy = getVecTypeForPair(Ty1, Ty2);
-
-            bool NeedsExtraction = false;
-            for (User *U : S->first->users()) {
-              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
-                // Shuffle can be folded if it has no other input
-                if (isa<UndefValue>(SI->getOperand(1)))
-                  continue;
-              }
-              if (isa<ExtractElementInst>(U))
-                continue;
-              if (PrunedDAGInstrs.count(U))
-                continue;
-              NeedsExtraction = true;
-              break;
-            }
-
-            if (NeedsExtraction) {
-              int ESContrib;
-              if (Ty1->isVectorTy()) {
-                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
-                                               Ty1, VTy);
-                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
-                  TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
-              } else
-                ESContrib = (int) TTI->getVectorInstrCost(
-                                    Instruction::ExtractElement, VTy, 0);
-
-              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
-                *S->first << "} = " << ESContrib << "\n");
-              EffSize -= ESContrib;
-            }
-
-            NeedsExtraction = false;
-            for (User *U : S->second->users()) {
-              if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(U)) {
-                // Shuffle can be folded if it has no other input
-                if (isa<UndefValue>(SI->getOperand(1)))
-                  continue;
-              }
-              if (isa<ExtractElementInst>(U))
-                continue;
-              if (PrunedDAGInstrs.count(U))
-                continue;
-              NeedsExtraction = true;
-              break;
-            }
-
-            if (NeedsExtraction) {
-              int ESContrib;
-              if (Ty2->isVectorTy()) {
-                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
-                                               Ty2, VTy);
-                ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
-                  TargetTransformInfo::SK_ExtractSubvector, VTy,
-                  Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
-              } else
-                ESContrib = (int) TTI->getVectorInstrCost(
-                                    Instruction::ExtractElement, VTy, 1);
-              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
-                *S->second << "} = " << ESContrib << "\n");
-              EffSize -= ESContrib;
-            }
-          }
-
-          // Compute the cost of incoming edges.
-          if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
-            Instruction *S1 = cast<Instruction>(S->first),
-                        *S2 = cast<Instruction>(S->second);
-            for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
-              Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
-
-              // Combining constants into vector constants (or small vector
-              // constants into larger ones are assumed free).
-              if (isa<Constant>(O1) && isa<Constant>(O2))
-                continue;
-
-              if (FlipOrder)
-                std::swap(O1, O2);
-
-              ValuePair VP  = ValuePair(O1, O2);
-              ValuePair VPR = ValuePair(O2, O1);
-
-              // Internal edges are not handled here.
-              if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
-                continue;
-
-              Type *Ty1 = O1->getType(),
-                   *Ty2 = O2->getType();
-              Type *VTy = getVecTypeForPair(Ty1, Ty2);
-
-              // Combining vector operations of the same type is also assumed
-              // folded with other operations.
-              if (Ty1 == Ty2) {
-                // If both are insert elements, then both can be widened.
-                InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
-                                  *IEO2 = dyn_cast<InsertElementInst>(O2);
-                if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
-                  continue;
-                // If both are extract elements, and both have the same input
-                // type, then they can be replaced with a shuffle
-                ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
-                                   *EIO2 = dyn_cast<ExtractElementInst>(O2);
-                if (EIO1 && EIO2 &&
-                    EIO1->getOperand(0)->getType() ==
-                      EIO2->getOperand(0)->getType())
-                  continue;
-                // If both are a shuffle with equal operand types and only two
-                // unqiue operands, then they can be replaced with a single
-                // shuffle
-                ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
-                                  *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
-                if (SIO1 && SIO2 &&
-                    SIO1->getOperand(0)->getType() ==
-                      SIO2->getOperand(0)->getType()) {
-                  SmallSet<Value *, 4> SIOps;
-                  SIOps.insert(SIO1->getOperand(0));
-                  SIOps.insert(SIO1->getOperand(1));
-                  SIOps.insert(SIO2->getOperand(0));
-                  SIOps.insert(SIO2->getOperand(1));
-                  if (SIOps.size() <= 2)
-                    continue;
-                }
-              }
-
-              int ESContrib;
-              // This pair has already been formed.
-              if (IncomingPairs.count(VP)) {
-                continue;
-              } else if (IncomingPairs.count(VPR)) {
-                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
-                                               VTy, VTy);
-
-                if (VTy->getVectorNumElements() == 2)
-                  ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
-                    TargetTransformInfo::SK_Reverse, VTy));
-              } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
-                ESContrib = (int) TTI->getVectorInstrCost(
-                                    Instruction::InsertElement, VTy, 0);
-                ESContrib += (int) TTI->getVectorInstrCost(
-                                     Instruction::InsertElement, VTy, 1);
-              } else if (!Ty1->isVectorTy()) {
-                // O1 needs to be inserted into a vector of size O2, and then
-                // both need to be shuffled together.
-                ESContrib = (int) TTI->getVectorInstrCost(
-                                    Instruction::InsertElement, Ty2, 0);
-                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
-                                                VTy, Ty2);
-              } else if (!Ty2->isVectorTy()) {
-                // O2 needs to be inserted into a vector of size O1, and then
-                // both need to be shuffled together.
-                ESContrib = (int) TTI->getVectorInstrCost(
-                                    Instruction::InsertElement, Ty1, 0);
-                ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
-                                                VTy, Ty1);
-              } else {
-                Type *TyBig = Ty1, *TySmall = Ty2;
-                if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
-                  std::swap(TyBig, TySmall);
-
-                ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
-                                               VTy, TyBig);
-                if (TyBig != TySmall)
-                  ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
-                                                  TyBig, TySmall);
-              }
-
-              DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
-                     << *O1 << " <-> " << *O2 << "} = " <<
-                     ESContrib << "\n");
-              EffSize -= ESContrib;
-              IncomingPairs.insert(VP);
-            }
-          }
-        }
-
-        if (!HasNontrivialInsts) {
-          DEBUG(if (DebugPairSelection) dbgs() <<
-                "\tNo non-trivial instructions in DAG;"
-                " override to zero effective size\n");
-          EffSize = 0;
-        }
-      } else {
-        for (DenseSet<ValuePair>::iterator S = PrunedDAG.begin(),
-             E = PrunedDAG.end(); S != E; ++S)
-          EffSize += (int) getDepthFactor(S->first);
-      }
-
-      DEBUG(if (DebugPairSelection)
-             dbgs() << "BBV: found pruned DAG for pair {"
-             << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
-             MaxDepth << " and size " << PrunedDAG.size() <<
-            " (effective size: " << EffSize << ")\n");
-      if (((TTI && !UseChainDepthWithTI) ||
-            MaxDepth >= Config.ReqChainDepth) &&
-          EffSize > 0 && EffSize > BestEffSize) {
-        BestMaxDepth = MaxDepth;
-        BestEffSize = EffSize;
-        BestDAG = PrunedDAG;
-      }
-    }
-  }
-
-  // Given the list of candidate pairs, this function selects those
-  // that will be fused into vector instructions.
-  void BBVectorize::choosePairs(
-                DenseMap<Value *, std::vector<Value *> > &CandidatePairs,
-                DenseSet<ValuePair> &CandidatePairsSet,
-                DenseMap<ValuePair, int> &CandidatePairCostSavings,
-                std::vector<Value *> &PairableInsts,
-                DenseSet<ValuePair> &FixedOrderPairs,
-                DenseMap<VPPair, unsigned> &PairConnectionTypes,
-                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-                DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps,
-                DenseSet<ValuePair> &PairableInstUsers,
-                DenseMap<Value *, Value *>& ChosenPairs) {
-    bool UseCycleCheck =
-     CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
-
-    DenseMap<Value *, std::vector<Value *> > CandidatePairs2;
-    for (DenseSet<ValuePair>::iterator I = CandidatePairsSet.begin(),
-         E = CandidatePairsSet.end(); I != E; ++I) {
-      std::vector<Value *> &JJ = CandidatePairs2[I->second];
-      if (JJ.empty()) JJ.reserve(32);
-      JJ.push_back(I->first);
-    }
-
-    DenseMap<ValuePair, std::vector<ValuePair> > PairableInstUserMap;
-    DenseSet<VPPair> PairableInstUserPairSet;
-    for (std::vector<Value *>::iterator I = PairableInsts.begin(),
-         E = PairableInsts.end(); I != E; ++I) {
-      // The number of possible pairings for this variable:
-      size_t NumChoices = CandidatePairs.lookup(*I).size();
-      if (!NumChoices) continue;
-
-      std::vector<Value *> &JJ = CandidatePairs[*I];
-
-      // The best pair to choose and its dag:
-      size_t BestMaxDepth = 0;
-      int BestEffSize = 0;
-      DenseSet<ValuePair> BestDAG;
-      findBestDAGFor(CandidatePairs, CandidatePairsSet,
-                      CandidatePairCostSavings,
-                      PairableInsts, FixedOrderPairs, PairConnectionTypes,
-                      ConnectedPairs, ConnectedPairDeps,
-                      PairableInstUsers, PairableInstUserMap,
-                      PairableInstUserPairSet, ChosenPairs,
-                      BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
-                      UseCycleCheck);
-
-      if (BestDAG.empty())
-        continue;
-
-      // A dag has been chosen (or not) at this point. If no dag was
-      // chosen, then this instruction, I, cannot be paired (and is no longer
-      // considered).
-
-      DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
-                   << *cast<Instruction>(*I) << "\n");
-
-      for (DenseSet<ValuePair>::iterator S = BestDAG.begin(),
-           SE2 = BestDAG.end(); S != SE2; ++S) {
-        // Insert the members of this dag into the list of chosen pairs.
-        ChosenPairs.insert(ValuePair(S->first, S->second));
-        DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
-               *S->second << "\n");
-
-        // Remove all candidate pairs that have values in the chosen dag.
-        std::vector<Value *> &KK = CandidatePairs[S->first];
-        for (std::vector<Value *>::iterator K = KK.begin(), KE = KK.end();
-             K != KE; ++K) {
-          if (*K == S->second)
-            continue;
-
-          CandidatePairsSet.erase(ValuePair(S->first, *K));
-        }
-
-        std::vector<Value *> &LL = CandidatePairs2[S->second];
-        for (std::vector<Value *>::iterator L = LL.begin(), LE = LL.end();
-             L != LE; ++L) {
-          if (*L == S->first)
-            continue;
-
-          CandidatePairsSet.erase(ValuePair(*L, S->second));
-        }
-
-        std::vector<Value *> &MM = CandidatePairs[S->second];
-        for (std::vector<Value *>::iterator M = MM.begin(), ME = MM.end();
-             M != ME; ++M) {
-          assert(*M != S->first && "Flipped pair in candidate list?");
-          CandidatePairsSet.erase(ValuePair(S->second, *M));
-        }
-
-        std::vector<Value *> &NN = CandidatePairs2[S->first];
-        for (std::vector<Value *>::iterator N = NN.begin(), NE = NN.end();
-             N != NE; ++N) {
-          assert(*N != S->second && "Flipped pair in candidate list?");
-          CandidatePairsSet.erase(ValuePair(*N, S->first));
-        }
-      }
-    }
-
-    DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
-  }
-
-  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
-                     unsigned n = 0) {
-    if (!I->hasName())
-      return "";
-
-    return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
-             (n > 0 ? "." + utostr(n) : "")).str();
-  }
-
-  // Returns the value that is to be used as the pointer input to the vector
-  // instruction that fuses I with J.
-  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
-                     Instruction *I, Instruction *J, unsigned o) {
-    Value *IPtr, *JPtr;
-    unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
-    int64_t OffsetInElmts;
-
-    // Note: the analysis might fail here, that is why the pair order has
-    // been precomputed (OffsetInElmts must be unused here).
-    (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
-                          IAddressSpace, JAddressSpace,
-                          OffsetInElmts, false);
-
-    // The pointer value is taken to be the one with the lowest offset.
-    Value *VPtr = IPtr;
-
-    Type *ArgTypeI = IPtr->getType()->getPointerElementType();
-    Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
-    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-    Type *VArgPtrType
-      = PointerType::get(VArgType,
-                         IPtr->getType()->getPointerAddressSpace());
-    return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
-                        /* insert before */ I);
-  }
-
-  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
-                     unsigned MaskOffset, unsigned NumInElem,
-                     unsigned NumInElem1, unsigned IdxOffset,
-                     std::vector<Constant*> &Mask) {
-    unsigned NumElem1 = J->getType()->getVectorNumElements();
-    for (unsigned v = 0; v < NumElem1; ++v) {
-      int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
-      if (m < 0) {
-        Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
-      } else {
-        unsigned mm = m + (int) IdxOffset;
-        if (m >= (int) NumInElem1)
-          mm += (int) NumInElem;
-
-        Mask[v+MaskOffset] =
-          ConstantInt::get(Type::getInt32Ty(Context), mm);
-      }
-    }
-  }
-
-  // Returns the value that is to be used as the vector-shuffle mask to the
-  // vector instruction that fuses I with J.
-  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
-                     Instruction *I, Instruction *J) {
-    // This is the shuffle mask. We need to append the second
-    // mask to the first, and the numbers need to be adjusted.
-
-    Type *ArgTypeI = I->getType();
-    Type *ArgTypeJ = J->getType();
-    Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
-    unsigned NumElemI = ArgTypeI->getVectorNumElements();
-
-    // Get the total number of elements in the fused vector type.
-    // By definition, this must equal the number of elements in
-    // the final mask.
-    unsigned NumElem = VArgType->getVectorNumElements();
-    std::vector<Constant*> Mask(NumElem);
-
-    Type *OpTypeI = I->getOperand(0)->getType();
-    unsigned NumInElemI = OpTypeI->getVectorNumElements();
-    Type *OpTypeJ = J->getOperand(0)->getType();
-    unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
-
-    // The fused vector will be:
-    // -----------------------------------------------------
-    // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
-    // -----------------------------------------------------
-    // from which we'll extract NumElem total elements (where the first NumElemI
-    // of them come from the mask in I and the remainder come from the mask
-    // in J.
-
-    // For the mask from the first pair...
-    fillNewShuffleMask(Context, I, 0,        NumInElemJ, NumInElemI,
-                       0,          Mask);
-
-    // For the mask from the second pair...
-    fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
-                       NumInElemI, Mask);
-
-    return ConstantVector::get(Mask);
-  }
-
-  bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
-                                  Instruction *J, unsigned o, Value *&LOp,
-                                  unsigned numElemL,
-                                  Type *ArgTypeL, Type *ArgTypeH,
-                                  bool IBeforeJ, unsigned IdxOff) {
-    bool ExpandedIEChain = false;
-    if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
-      // If we have a pure insertelement chain, then this can be rewritten
-      // into a chain that directly builds the larger type.
-      if (isPureIEChain(LIE)) {
-        SmallVector<Value *, 8> VectElemts(numElemL,
-          UndefValue::get(ArgTypeL->getScalarType()));
-        InsertElementInst *LIENext = LIE;
-        do {
-          unsigned Idx =
-            cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue();
-          VectElemts[Idx] = LIENext->getOperand(1);
-        } while ((LIENext =
-                   dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
-
-        LIENext = nullptr;
-        Value *LIEPrev = UndefValue::get(ArgTypeH);
-        for (unsigned i = 0; i < numElemL; ++i) {
-          if (isa<UndefValue>(VectElemts[i])) continue;
-          LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
-                             ConstantInt::get(Type::getInt32Ty(Context),
-                                              i + IdxOff),
-                             getReplacementName(IBeforeJ ? I : J,
-                                                true, o, i+1));
-          LIENext->insertBefore(IBeforeJ ? J : I);
-          LIEPrev = LIENext;
-        }
-
-        LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
-        ExpandedIEChain = true;
-      }
-    }
-
-    return ExpandedIEChain;
-  }
-
-  static unsigned getNumScalarElements(Type *Ty) {
-    if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
-      return VecTy->getNumElements();
-    return 1;
-  }
-
-  // Returns the value to be used as the specified operand of the vector
-  // instruction that fuses I with J.
-  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
-                     Instruction *J, unsigned o, bool IBeforeJ) {
-    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
-    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
-
-    // Compute the fused vector type for this operand
-    Type *ArgTypeI = I->getOperand(o)->getType();
-    Type *ArgTypeJ = J->getOperand(o)->getType();
-    VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
-    Instruction *L = I, *H = J;
-    Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
-
-    unsigned numElemL = getNumScalarElements(ArgTypeL);
-    unsigned numElemH = getNumScalarElements(ArgTypeH);
-
-    Value *LOp = L->getOperand(o);
-    Value *HOp = H->getOperand(o);
-    unsigned numElem = VArgType->getNumElements();
-
-    // First, we check if we can reuse the "original" vector outputs (if these
-    // exist). We might need a shuffle.
-    ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp);
-    ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp);
-    ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp);
-    ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp);
-
-    // FIXME: If we're fusing shuffle instructions, then we can't apply this
-    // optimization. The input vectors to the shuffle might be a different
-    // length from the shuffle outputs. Unfortunately, the replacement
-    // shuffle mask has already been formed, and the mask entries are sensitive
-    // to the sizes of the inputs.
-    bool IsSizeChangeShuffle =
-      isa<ShuffleVectorInst>(L) &&
-        (LOp->getType() != L->getType() || HOp->getType() != H->getType());
-
-    if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
-      // We can have at most two unique vector inputs.
-      bool CanUseInputs = true;
-      Value *I1, *I2 = nullptr;
-      if (LEE) {
-        I1 = LEE->getOperand(0);
-      } else {
-        I1 = LSV->getOperand(0);
-        I2 = LSV->getOperand(1);
-        if (I2 == I1 || isa<UndefValue>(I2))
-          I2 = nullptr;
-      }
-
-      if (HEE) {
-        Value *I3 = HEE->getOperand(0);
-        if (!I2 && I3 != I1)
-          I2 = I3;
-        else if (I3 != I1 && I3 != I2)
-          CanUseInputs = false;
-      } else {
-        Value *I3 = HSV->getOperand(0);
-        if (!I2 && I3 != I1)
-          I2 = I3;
-        else if (I3 != I1 && I3 != I2)
-          CanUseInputs = false;
-
-        if (CanUseInputs) {
-          Value *I4 = HSV->getOperand(1);
-          if (!isa<UndefValue>(I4)) {
-            if (!I2 && I4 != I1)
-              I2 = I4;
-            else if (I4 != I1 && I4 != I2)
-              CanUseInputs = false;
-          }
-        }
-      }
-
-      if (CanUseInputs) {
-        unsigned LOpElem =
-          cast<Instruction>(LOp)->getOperand(0)->getType()
-            ->getVectorNumElements();
-
-        unsigned HOpElem =
-          cast<Instruction>(HOp)->getOperand(0)->getType()
-            ->getVectorNumElements();
-
-        // We have one or two input vectors. We need to map each index of the
-        // operands to the index of the original vector.
-        SmallVector<std::pair<int, int>, 8>  II(numElem);
-        for (unsigned i = 0; i < numElemL; ++i) {
-          int Idx, INum;
-          if (LEE) {
-            Idx =
-              cast<ConstantInt>(LEE->getOperand(1))->getSExtValue();
-            INum = LEE->getOperand(0) == I1 ? 0 : 1;
-          } else {
-            Idx = LSV->getMaskValue(i);
-            if (Idx < (int) LOpElem) {
-              INum = LSV->getOperand(0) == I1 ? 0 : 1;
-            } else {
-              Idx -= LOpElem;
-              INum = LSV->getOperand(1) == I1 ? 0 : 1;
-            }
-          }
-
-          II[i] = std::pair<int, int>(Idx, INum);
-        }
-        for (unsigned i = 0; i < numElemH; ++i) {
-          int Idx, INum;
-          if (HEE) {
-            Idx =
-              cast<ConstantInt>(HEE->getOperand(1))->getSExtValue();
-            INum = HEE->getOperand(0) == I1 ? 0 : 1;
-          } else {
-            Idx = HSV->getMaskValue(i);
-            if (Idx < (int) HOpElem) {
-              INum = HSV->getOperand(0) == I1 ? 0 : 1;
-            } else {
-              Idx -= HOpElem;
-              INum = HSV->getOperand(1) == I1 ? 0 : 1;
-            }
-          }
-
-          II[i + numElemL] = std::pair<int, int>(Idx, INum);
-        }
-
-        // We now have an array which tells us from which index of which
-        // input vector each element of the operand comes.
-        VectorType *I1T = cast<VectorType>(I1->getType());
-        unsigned I1Elem = I1T->getNumElements();
-
-        if (!I2) {
-          // In this case there is only one underlying vector input. Check for
-          // the trivial case where we can use the input directly.
-          if (I1Elem == numElem) {
-            bool ElemInOrder = true;
-            for (unsigned i = 0; i < numElem; ++i) {
-              if (II[i].first != (int) i && II[i].first != -1) {
-                ElemInOrder = false;
-                break;
-              }
-            }
-
-            if (ElemInOrder)
-              return I1;
-          }
-
-          // A shuffle is needed.
-          std::vector<Constant *> Mask(numElem);
-          for (unsigned i = 0; i < numElem; ++i) {
-            int Idx = II[i].first;
-            if (Idx == -1)
-              Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
-            else
-              Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
-          }
-
-          Instruction *S =
-            new ShuffleVectorInst(I1, UndefValue::get(I1T),
-                                  ConstantVector::get(Mask),
-                                  getReplacementName(IBeforeJ ? I : J,
-                                                     true, o));
-          S->insertBefore(IBeforeJ ? J : I);
-          return S;
-        }
-
-        VectorType *I2T = cast<VectorType>(I2->getType());
-        unsigned I2Elem = I2T->getNumElements();
-
-        // This input comes from two distinct vectors. The first step is to
-        // make sure that both vectors are the same length. If not, the
-        // smaller one will need to grow before they can be shuffled together.
-        if (I1Elem < I2Elem) {
-          std::vector<Constant *> Mask(I2Elem);
-          unsigned v = 0;
-          for (; v < I1Elem; ++v)
-            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-          for (; v < I2Elem; ++v)
-            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
-          Instruction *NewI1 =
-            new ShuffleVectorInst(I1, UndefValue::get(I1T),
-                                  ConstantVector::get(Mask),
-                                  getReplacementName(IBeforeJ ? I : J,
-                                                     true, o, 1));
-          NewI1->insertBefore(IBeforeJ ? J : I);
-          I1 = NewI1;
-          I1Elem = I2Elem;
-        } else if (I1Elem > I2Elem) {
-          std::vector<Constant *> Mask(I1Elem);
-          unsigned v = 0;
-          for (; v < I2Elem; ++v)
-            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-          for (; v < I1Elem; ++v)
-            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
-          Instruction *NewI2 =
-            new ShuffleVectorInst(I2, UndefValue::get(I2T),
-                                  ConstantVector::get(Mask),
-                                  getReplacementName(IBeforeJ ? I : J,
-                                                     true, o, 1));
-          NewI2->insertBefore(IBeforeJ ? J : I);
-          I2 = NewI2;
-        }
-
-        // Now that both I1 and I2 are the same length we can shuffle them
-        // together (and use the result).
-        std::vector<Constant *> Mask(numElem);
-        for (unsigned v = 0; v < numElem; ++v) {
-          if (II[v].first == -1) {
-            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-          } else {
-            int Idx = II[v].first + II[v].second * I1Elem;
-            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
-          }
-        }
-
-        Instruction *NewOp =
-          new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
-                                getReplacementName(IBeforeJ ? I : J, true, o));
-        NewOp->insertBefore(IBeforeJ ? J : I);
-        return NewOp;
-      }
-    }
-
-    Type *ArgType = ArgTypeL;
-    if (numElemL < numElemH) {
-      if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
-                                         ArgTypeL, VArgType, IBeforeJ, 1)) {
-        // This is another short-circuit case: we're combining a scalar into
-        // a vector that is formed by an IE chain. We've just expanded the IE
-        // chain, now insert the scalar and we're done.
-
-        Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
-                           getReplacementName(IBeforeJ ? I : J, true, o));
-        S->insertBefore(IBeforeJ ? J : I);
-        return S;
-      } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
-                                ArgTypeH, IBeforeJ)) {
-        // The two vector inputs to the shuffle must be the same length,
-        // so extend the smaller vector to be the same length as the larger one.
-        Instruction *NLOp;
-        if (numElemL > 1) {
-
-          std::vector<Constant *> Mask(numElemH);
-          unsigned v = 0;
-          for (; v < numElemL; ++v)
-            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-          for (; v < numElemH; ++v)
-            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
-          NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
-                                       ConstantVector::get(Mask),
-                                       getReplacementName(IBeforeJ ? I : J,
-                                                          true, o, 1));
-        } else {
-          NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
-                                           getReplacementName(IBeforeJ ? I : J,
-                                                              true, o, 1));
-        }
-
-        NLOp->insertBefore(IBeforeJ ? J : I);
-        LOp = NLOp;
-      }
-
-      ArgType = ArgTypeH;
-    } else if (numElemL > numElemH) {
-      if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
-                                         ArgTypeH, VArgType, IBeforeJ)) {
-        Instruction *S =
-          InsertElementInst::Create(LOp, HOp,
-                                    ConstantInt::get(Type::getInt32Ty(Context),
-                                                     numElemL),
-                                    getReplacementName(IBeforeJ ? I : J,
-                                                       true, o));
-        S->insertBefore(IBeforeJ ? J : I);
-        return S;
-      } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
-                                ArgTypeL, IBeforeJ)) {
-        Instruction *NHOp;
-        if (numElemH > 1) {
-          std::vector<Constant *> Mask(numElemL);
-          unsigned v = 0;
-          for (; v < numElemH; ++v)
-            Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-          for (; v < numElemL; ++v)
-            Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-
-          NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
-                                       ConstantVector::get(Mask),
-                                       getReplacementName(IBeforeJ ? I : J,
-                                                          true, o, 1));
-        } else {
-          NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
-                                           getReplacementName(IBeforeJ ? I : J,
-                                                              true, o, 1));
-        }
-
-        NHOp->insertBefore(IBeforeJ ? J : I);
-        HOp = NHOp;
-      }
-    }
-
-    if (ArgType->isVectorTy()) {
-      unsigned numElem = VArgType->getVectorNumElements();
-      std::vector<Constant*> Mask(numElem);
-      for (unsigned v = 0; v < numElem; ++v) {
-        unsigned Idx = v;
-        // If the low vector was expanded, we need to skip the extra
-        // undefined entries.
-        if (v >= numElemL && numElemH > numElemL)
-          Idx += (numElemH - numElemL);
-        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
-      }
-
-      Instruction *BV = new ShuffleVectorInst(LOp, HOp,
-                          ConstantVector::get(Mask),
-                          getReplacementName(IBeforeJ ? I : J, true, o));
-      BV->insertBefore(IBeforeJ ? J : I);
-      return BV;
-    }
-
-    Instruction *BV1 = InsertElementInst::Create(
-                                          UndefValue::get(VArgType), LOp, CV0,
-                                          getReplacementName(IBeforeJ ? I : J,
-                                                             true, o, 1));
-    BV1->insertBefore(IBeforeJ ? J : I);
-    Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
-                                          getReplacementName(IBeforeJ ? I : J,
-                                                             true, o, 2));
-    BV2->insertBefore(IBeforeJ ? J : I);
-    return BV2;
-  }
-
-  // This function creates an array of values that will be used as the inputs
-  // to the vector instruction that fuses I with J.
-  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
-                     Instruction *I, Instruction *J,
-                     SmallVectorImpl<Value *> &ReplacedOperands,
-                     bool IBeforeJ) {
-    unsigned NumOperands = I->getNumOperands();
-
-    for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
-      // Iterate backward so that we look at the store pointer
-      // first and know whether or not we need to flip the inputs.
-
-      if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
-        // This is the pointer for a load/store instruction.
-        ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
-        continue;
-      } else if (isa<CallInst>(I)) {
-        Function *F = cast<CallInst>(I)->getCalledFunction();
-        Intrinsic::ID IID = F->getIntrinsicID();
-        if (o == NumOperands-1) {
-          BasicBlock &BB = *I->getParent();
-
-          Module *M = BB.getParent()->getParent();
-          Type *ArgTypeI = I->getType();
-          Type *ArgTypeJ = J->getType();
-          Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-
-          ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
-          continue;
-        } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
-                    IID == Intrinsic::cttz) && o == 1) {
-          // The second argument of powi/ctlz/cttz is a single integer/constant
-          // and we've already checked that both arguments are equal.
-          // As a result, we just keep I's second argument.
-          ReplacedOperands[o] = I->getOperand(o);
-          continue;
-        }
-      } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
-        ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
-        continue;
-      }
-
-      ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
-    }
-  }
-
-  // This function creates two values that represent the outputs of the
-  // original I and J instructions. These are generally vector shuffles
-  // or extracts. In many cases, these will end up being unused and, thus,
-  // eliminated by later passes.
-  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
-                     Instruction *J, Instruction *K,
-                     Instruction *&InsertionPt,
-                     Instruction *&K1, Instruction *&K2) {
-    if (isa<StoreInst>(I))
-      return;
-
-    Type *IType = I->getType();
-    Type *JType = J->getType();
-
-    VectorType *VType = getVecTypeForPair(IType, JType);
-    unsigned numElem = VType->getNumElements();
-
-    unsigned numElemI = getNumScalarElements(IType);
-    unsigned numElemJ = getNumScalarElements(JType);
-
-    if (IType->isVectorTy()) {
-      std::vector<Constant *> Mask1(numElemI), Mask2(numElemI);
-      for (unsigned v = 0; v < numElemI; ++v) {
-        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
-      }
-
-      K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                 ConstantVector::get(Mask1),
-                                 getReplacementName(K, false, 1));
-    } else {
-      Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
-      K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
-    }
-
-    if (JType->isVectorTy()) {
-      std::vector<Constant *> Mask1(numElemJ), Mask2(numElemJ);
-      for (unsigned v = 0; v < numElemJ; ++v) {
-        Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
-        Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
-      }
-
-      K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
-                                 ConstantVector::get(Mask2),
-                                 getReplacementName(K, false, 2));
-    } else {
-      Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
-      K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
-    }
-
-    K1->insertAfter(K);
-    K2->insertAfter(K1);
-    InsertionPt = K2;
-  }
-
-  // Move all uses of the function I (including pairing-induced uses) after J.
-  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
-                     DenseSet<ValuePair> &LoadMoveSetPairs,
-                     Instruction *I, Instruction *J) {
-    // Skip to the first instruction past I.
-    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
-    DenseSet<Value *> Users;
-    AliasSetTracker WriteSet(*AA);
-    if (I->mayWriteToMemory()) WriteSet.add(I);
-
-    for (; cast<Instruction>(L) != J; ++L)
-      (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
-
-    assert(cast<Instruction>(L) == J &&
-      "Tracking has not proceeded far enough to check for dependencies");
-    // If J is now in the use set of I, then trackUsesOfI will return true
-    // and we have a dependency cycle (and the fusing operation must abort).
-    return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
-  }
-
-  // Move all uses of the function I (including pairing-induced uses) after J.
-  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
-                     DenseSet<ValuePair> &LoadMoveSetPairs,
-                     Instruction *&InsertionPt,
-                     Instruction *I, Instruction *J) {
-    // Skip to the first instruction past I.
-    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
-    DenseSet<Value *> Users;
-    AliasSetTracker WriteSet(*AA);
-    if (I->mayWriteToMemory()) WriteSet.add(I);
-
-    for (; cast<Instruction>(L) != J;) {
-      if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
-        // Move this instruction
-        Instruction *InstToMove = &*L++;
-
-        DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
-                        " to after " << *InsertionPt << "\n");
-        InstToMove->removeFromParent();
-        InstToMove->insertAfter(InsertionPt);
-        InsertionPt = InstToMove;
-      } else {
-        ++L;
-      }
-    }
-  }
-
-  // Collect all load instruction that are in the move set of a given first
-  // pair member.  These loads depend on the first instruction, I, and so need
-  // to be moved after J (the second instruction) when the pair is fused.
-  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
-                     DenseMap<Value *, Value *> &ChosenPairs,
-                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
-                     DenseSet<ValuePair> &LoadMoveSetPairs,
-                     Instruction *I) {
-    // Skip to the first instruction past I.
-    BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
-
-    DenseSet<Value *> Users;
-    AliasSetTracker WriteSet(*AA);
-    if (I->mayWriteToMemory()) WriteSet.add(I);
-
-    // Note: We cannot end the loop when we reach J because J could be moved
-    // farther down the use chain by another instruction pairing. Also, J
-    // could be before I if this is an inverted input.
-    for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
-      if (trackUsesOfI(Users, WriteSet, I, &*L)) {
-        if (L->mayReadFromMemory()) {
-          LoadMoveSet[&*L].push_back(I);
-          LoadMoveSetPairs.insert(ValuePair(&*L, I));
-        }
-      }
-    }
-  }
-
-  // In cases where both load/stores and the computation of their pointers
-  // are chosen for vectorization, we can end up in a situation where the
-  // aliasing analysis starts returning different query results as the
-  // process of fusing instruction pairs continues. Because the algorithm
-  // relies on finding the same use dags here as were found earlier, we'll
-  // need to precompute the necessary aliasing information here and then
-  // manually update it during the fusion process.
-  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
-                     std::vector<Value *> &PairableInsts,
-                     DenseMap<Value *, Value *> &ChosenPairs,
-                     DenseMap<Value *, std::vector<Value *> > &LoadMoveSet,
-                     DenseSet<ValuePair> &LoadMoveSetPairs) {
-    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
-         PIE = PairableInsts.end(); PI != PIE; ++PI) {
-      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
-      if (P == ChosenPairs.end()) continue;
-
-      Instruction *I = cast<Instruction>(P->first);
-      collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
-                             LoadMoveSetPairs, I);
-    }
-  }
-
-  // This function fuses the chosen instruction pairs into vector instructions,
-  // taking care preserve any needed scalar outputs and, then, it reorders the
-  // remaining instructions as needed (users of the first member of the pair
-  // need to be moved to after the location of the second member of the pair
-  // because the vector instruction is inserted in the location of the pair's
-  // second member).
-  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
-             std::vector<Value *> &PairableInsts,
-             DenseMap<Value *, Value *> &ChosenPairs,
-             DenseSet<ValuePair> &FixedOrderPairs,
-             DenseMap<VPPair, unsigned> &PairConnectionTypes,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs,
-             DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps) {
-    LLVMContext& Context = BB.getContext();
-
-    // During the vectorization process, the order of the pairs to be fused
-    // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
-    // list. After a pair is fused, the flipped pair is removed from the list.
-    DenseSet<ValuePair> FlippedPairs;
-    for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
-         E = ChosenPairs.end(); P != E; ++P)
-      FlippedPairs.insert(ValuePair(P->second, P->first));
-    for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
-         E = FlippedPairs.end(); P != E; ++P)
-      ChosenPairs.insert(*P);
-
-    DenseMap<Value *, std::vector<Value *> > LoadMoveSet;
-    DenseSet<ValuePair> LoadMoveSetPairs;
-    collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
-                       LoadMoveSet, LoadMoveSetPairs);
-
-    DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
-
-    for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
-      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(&*PI);
-      if (P == ChosenPairs.end()) {
-        ++PI;
-        continue;
-      }
-
-      if (getDepthFactor(P->first) == 0) {
-        // These instructions are not really fused, but are tracked as though
-        // they are. Any case in which it would be interesting to fuse them
-        // will be taken care of by InstCombine.
-        --NumFusedOps;
-        ++PI;
-        continue;
-      }
-
-      Instruction *I = cast<Instruction>(P->first),
-        *J = cast<Instruction>(P->second);
-
-      DEBUG(dbgs() << "BBV: fusing: " << *I <<
-             " <-> " << *J << "\n");
-
-      // Remove the pair and flipped pair from the list.
-      DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
-      assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
-      ChosenPairs.erase(FP);
-      ChosenPairs.erase(P);
-
-      if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
-        DEBUG(dbgs() << "BBV: fusion of: " << *I <<
-               " <-> " << *J <<
-               " aborted because of non-trivial dependency cycle\n");
-        --NumFusedOps;
-        ++PI;
-        continue;
-      }
-
-      // If the pair must have the other order, then flip it.
-      bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
-      if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
-        // This pair does not have a fixed order, and so we might want to
-        // flip it if that will yield fewer shuffles. We count the number
-        // of dependencies connected via swaps, and those directly connected,
-        // and flip the order if the number of swaps is greater.
-        bool OrigOrder = true;
-        DenseMap<ValuePair, std::vector<ValuePair> >::iterator IJ =
-          ConnectedPairDeps.find(ValuePair(I, J));
-        if (IJ == ConnectedPairDeps.end()) {
-          IJ = ConnectedPairDeps.find(ValuePair(J, I));
-          OrigOrder = false;
-        }
-
-        if (IJ != ConnectedPairDeps.end()) {
-          unsigned NumDepsDirect = 0, NumDepsSwap = 0;
-          for (std::vector<ValuePair>::iterator T = IJ->second.begin(),
-               TE = IJ->second.end(); T != TE; ++T) {
-            VPPair Q(IJ->first, *T);
-            DenseMap<VPPair, unsigned>::iterator R =
-              PairConnectionTypes.find(VPPair(Q.second, Q.first));
-            assert(R != PairConnectionTypes.end() &&
-                   "Cannot find pair connection type");
-            if (R->second == PairConnectionDirect)
-              ++NumDepsDirect;
-            else if (R->second == PairConnectionSwap)
-              ++NumDepsSwap;
-          }
-
-          if (!OrigOrder)
-            std::swap(NumDepsDirect, NumDepsSwap);
-
-          if (NumDepsSwap > NumDepsDirect) {
-            FlipPairOrder = true;
-            DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
-                            " <-> " << *J << "\n");
-          }
-        }
-      }
-
-      Instruction *L = I, *H = J;
-      if (FlipPairOrder)
-        std::swap(H, L);
-
-      // If the pair being fused uses the opposite order from that in the pair
-      // connection map, then we need to flip the types.
-      DenseMap<ValuePair, std::vector<ValuePair> >::iterator HL =
-        ConnectedPairs.find(ValuePair(H, L));
-      if (HL != ConnectedPairs.end())
-        for (std::vector<ValuePair>::iterator T = HL->second.begin(),
-             TE = HL->second.end(); T != TE; ++T) {
-          VPPair Q(HL->first, *T);
-          DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(Q);
-          assert(R != PairConnectionTypes.end() &&
-                 "Cannot find pair connection type");
-          if (R->second == PairConnectionDirect)
-            R->second = PairConnectionSwap;
-          else if (R->second == PairConnectionSwap)
-            R->second = PairConnectionDirect;
-        }
-
-      bool LBeforeH = !FlipPairOrder;
-      unsigned NumOperands = I->getNumOperands();
-      SmallVector<Value *, 3> ReplacedOperands(NumOperands);
-      getReplacementInputsForPair(Context, L, H, ReplacedOperands,
-                                  LBeforeH);
-
-      // Make a copy of the original operation, change its type to the vector
-      // type and replace its operands with the vector operands.
-      Instruction *K = L->clone();
-      if (L->hasName())
-        K->takeName(L);
-      else if (H->hasName())
-        K->takeName(H);
-
-      if (auto CS = CallSite(K)) {
-        SmallVector<Type *, 3> Tys;
-        FunctionType *Old = CS.getFunctionType();
-        unsigned NumOld = Old->getNumParams();
-        assert(NumOld <= ReplacedOperands.size());
-        for (unsigned i = 0; i != NumOld; ++i)
-          Tys.push_back(ReplacedOperands[i]->getType());
-        CS.mutateFunctionType(
-            FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
-                              Tys, Old->isVarArg()));
-      } else if (!isa<StoreInst>(K))
-        K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
-
-      unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
-                             LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
-                             LLVMContext::MD_invariant_group};
-      combineMetadata(K, H, KnownIDs);
-      K->andIRFlags(H);
-
-      for (unsigned o = 0; o < NumOperands; ++o)
-        K->setOperand(o, ReplacedOperands[o]);
-
-      K->insertAfter(J);
-
-      // Instruction insertion point:
-      Instruction *InsertionPt = K;
-      Instruction *K1 = nullptr, *K2 = nullptr;
-      replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
-
-      // The use dag of the first original instruction must be moved to after
-      // the location of the second instruction. The entire use dag of the
-      // first instruction is disjoint from the input dag of the second
-      // (by definition), and so commutes with it.
-
-      moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
-
-      if (!isa<StoreInst>(I)) {
-        L->replaceAllUsesWith(K1);
-        H->replaceAllUsesWith(K2);
-      }
-
-      // Instructions that may read from memory may be in the load move set.
-      // Once an instruction is fused, we no longer need its move set, and so
-      // the values of the map never need to be updated. However, when a load
-      // is fused, we need to merge the entries from both instructions in the
-      // pair in case those instructions were in the move set of some other
-      // yet-to-be-fused pair. The loads in question are the keys of the map.
-      if (I->mayReadFromMemory()) {
-        std::vector<ValuePair> NewSetMembers;
-        DenseMap<Value *, std::vector<Value *> >::iterator II =
-          LoadMoveSet.find(I);
-        if (II != LoadMoveSet.end())
-          for (std::vector<Value *>::iterator N = II->second.begin(),
-               NE = II->second.end(); N != NE; ++N)
-            NewSetMembers.push_back(ValuePair(K, *N));
-        DenseMap<Value *, std::vector<Value *> >::iterator JJ =
-          LoadMoveSet.find(J);
-        if (JJ != LoadMoveSet.end())
-          for (std::vector<Value *>::iterator N = JJ->second.begin(),
-               NE = JJ->second.end(); N != NE; ++N)
-            NewSetMembers.push_back(ValuePair(K, *N));
-        for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
-             AE = NewSetMembers.end(); A != AE; ++A) {
-          LoadMoveSet[A->first].push_back(A->second);
-          LoadMoveSetPairs.insert(*A);
-        }
-      }
-
-      // Before removing I, set the iterator to the next instruction.
-      PI = std::next(BasicBlock::iterator(I));
-      if (cast<Instruction>(PI) == J)
-        ++PI;
-
-      SE->forgetValue(I);
-      SE->forgetValue(J);
-      I->eraseFromParent();
-      J->eraseFromParent();
-
-      DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
-                                               BB << "\n");
-    }
-
-    DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
-  }
-}
-
-char BBVectorize::ID = 0;
-static const char bb_vectorize_name[] = "Basic-Block Vectorization";
-INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
-
-BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
-  return new BBVectorize(C);
-}
-
-bool
-llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
-  BBVectorize BBVectorizer(P, *BB.getParent(), C);
-  return BBVectorizer.vectorizeBB(BB);
-}
-
-//===----------------------------------------------------------------------===//
-VectorizeConfig::VectorizeConfig() {
-  VectorBits = ::VectorBits;
-  VectorizeBools = !::NoBools;
-  VectorizeInts = !::NoInts;
-  VectorizeFloats = !::NoFloats;
-  VectorizePointers = !::NoPointers;
-  VectorizeCasts = !::NoCasts;
-  VectorizeMath = !::NoMath;
-  VectorizeBitManipulations = !::NoBitManipulation;
-  VectorizeFMA = !::NoFMA;
-  VectorizeSelect = !::NoSelect;
-  VectorizeCmp = !::NoCmp;
-  VectorizeGEP = !::NoGEP;
-  VectorizeMemOps = !::NoMemOps;
-  AlignedOnly = ::AlignedOnly;
-  ReqChainDepth= ::ReqChainDepth;
-  SearchLimit = ::SearchLimit;
-  MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
-  SplatBreaksChain = ::SplatBreaksChain;
-  MaxInsts = ::MaxInsts;
-  MaxPairs = ::MaxPairs;
-  MaxIter = ::MaxIter;
-  Pow2LenOnly = ::Pow2LenOnly;
-  NoMemOpBoost = ::NoMemOpBoost;
-  FastDep = ::FastDep;
-}
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 395f440bda4..1aea73cd4a3 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_llvm_library(LLVMVectorize
-  BBVectorize.cpp
   LoadStoreVectorizer.cpp
   LoopVectorize.cpp
   SLPVectorizer.cpp
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index a2192831788..fb2f509dcba 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -26,7 +26,6 @@ using namespace llvm;
 /// initializeVectorizationPasses - Initialize all passes linked into the
 /// Vectorization library.
 void llvm::initializeVectorization(PassRegistry &Registry) {
-  initializeBBVectorizePass(Registry);
   initializeLoopVectorizePass(Registry);
   initializeSLPVectorizerPass(Registry);
   initializeLoadStoreVectorizerPass(Registry);
@@ -36,8 +35,8 @@ void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
   initializeVectorization(*unwrap(R));
 }
 
+// DEPRECATED: Remove after the LLVM 5 release.
 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
-  unwrap(PM)->add(createBBVectorizePass());
 }
 
 void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
diff --git a/test/Feature/optnone-opt.ll b/test/Feature/optnone-opt.ll
index efd35e56603..6410afb6be9 100644
--- a/test/Feature/optnone-opt.ll
+++ b/test/Feature/optnone-opt.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -O1 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1
 ; RUN: opt -O2 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1 --check-prefix=OPT-O2O3
 ; RUN: opt -O3 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1 --check-prefix=OPT-O2O3
-; RUN: opt -bb-vectorize -dce -die -gvn-hoist -loweratomic -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-MORE
+; RUN: opt -dce -die -gvn-hoist -loweratomic -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-MORE
 ; RUN: opt -indvars -licm -loop-deletion -loop-extract -loop-idiom -loop-instsimplify -loop-reduce -loop-reroll -loop-rotate -loop-unroll -loop-unswitch -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-LOOP
 
 ; REQUIRES: asserts
@@ -55,7 +55,6 @@ attributes #0 = { optnone noinline }
 ; OPT-O2O3-DAG: Skipping pass 'SLP Vectorizer'
 
 ; Additional IR passes that opt doesn't turn on by default.
-; OPT-MORE-DAG: Skipping pass 'Basic-Block Vectorization'
 ; OPT-MORE-DAG: Skipping pass 'Dead Code Elimination'
 ; OPT-MORE-DAG: Skipping pass 'Dead Instruction Elimination'
 ; OPT-MORE-DAG: Skipping pass 'Lower atomic intrinsics
diff --git a/test/Transforms/BBVectorize/X86/cmp-types.ll b/test/Transforms/BBVectorize/X86/cmp-types.ll
deleted file mode 100644
index fc1da1b0c60..00000000000
--- a/test/Transforms/BBVectorize/X86/cmp-types.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
-
-%"struct.btSoftBody" = type { float, float, float*, i8 }
-
-define void @test1(%"struct.btSoftBody"* %n1, %"struct.btSoftBody"* %n2) uwtable align 2 {
-entry:
-  %tobool15 = icmp ne %"struct.btSoftBody"* %n1, null
-  %cond16 = zext i1 %tobool15 to i32
-  %tobool21 = icmp ne %"struct.btSoftBody"* %n2, null
-  %cond22 = zext i1 %tobool21 to i32
-  ret void
-; CHECK-LABEL: @test1(
-}
-
diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll
deleted file mode 100644
index a533713609a..00000000000
--- a/test/Transforms/BBVectorize/X86/loop1.ll
+++ /dev/null
@@ -1,61 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-partial-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
-; The second check covers the use of alias analysis (with loop unrolling).
-
-define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
-entry:
-  br label %for.body
-; CHECK-LABEL: @test1(
-; CHECK-UNRL-LABEL: @test1(
-
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv
-  %1 = load double, double* %arrayidx2, align 8
-  %mul = fmul double %0, %0
-  %mul3 = fmul double %0, %1
-  %add = fadd double %mul, %mul3
-  %add4 = fadd double %1, %1
-  %add5 = fadd double %add4, %0
-  %mul6 = fmul double %0, %add5
-  %add7 = fadd double %add, %mul6
-  %mul8 = fmul double %1, %1
-  %add9 = fadd double %0, %0
-  %add10 = fadd double %add9, %0
-  %mul11 = fmul double %mul8, %add10
-  %add12 = fadd double %add7, %mul11
-  %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-  store double %add12, double* %arrayidx14, align 8
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 10
-  br i1 %exitcond, label %for.end, label %for.body
-; CHECK: insertelement
-; CHECK-NEXT: insertelement
-; CHECK-NEXT: fadd <2 x double>
-; CHECK-NEXT: insertelement
-; CHECK-NEXT: shufflevector
-; CHECK-NEXT: fadd <2 x double>
-; CHECK-NEXT: insertelement
-; CHECK-NEXT: fmul <2 x double>
-
-; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
-; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
-; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
-; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
-; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
-; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
-; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
-; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
-; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
-; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
-; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
-; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
-
-for.end:                                          ; preds = %for.body
-  ret void
-}
diff --git a/test/Transforms/BBVectorize/X86/pr15289.ll b/test/Transforms/BBVectorize/X86/pr15289.ll
deleted file mode 100644
index a383a260faf..00000000000
--- a/test/Transforms/BBVectorize/X86/pr15289.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; RUN: opt < %s -basicaa -bb-vectorize -disable-output
-; This is a bugpoint-reduced test case. It did not always assert, but does reproduce the bug
-; and running under valgrind (or some similar tool) will catch the error.
-
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin12.2.0"
-
-%0 = type { [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }] }
-%1 = type { [10 x [8 x i8]] }
-%2 = type { i64, i64 }
-%3 = type { [10 x i64], i64, i64, i64, i64, i64 }
-%4 = type { i64, i64, i64, i64, i64, i64 }
-%5 = type { [10 x i64] }
-%6 = type { [10 x float], [10 x float], [10 x float], [10 x float] }
-%struct.__st_parameter_dt.1.3.5.7 = type { %struct.__st_parameter_common.0.2.4.6, i64, i64*, i64*, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, [256 x i8], i32*, i64, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, [4 x i8] }
-%struct.__st_parameter_common.0.2.4.6 = type { i32, i32, i8*, i32, i32, i8*, i32* }
-
-@cctenso_ = external unnamed_addr global %0, align 32
-@ctenso_ = external unnamed_addr global %1, align 32
-@i_dim_ = external unnamed_addr global %2, align 16
-@itenso1_ = external unnamed_addr global %3, align 32
-@itenso2_ = external unnamed_addr global %4, align 32
-@ltenso_ = external unnamed_addr global %5, align 32
-@rtenso_ = external unnamed_addr global %6, align 32
-@.cst = external unnamed_addr constant [8 x i8], align 8
-@.cst1 = external unnamed_addr constant [3 x i8], align 8
-@.cst2 = external unnamed_addr constant [29 x i8], align 8
-@.cst3 = external unnamed_addr constant [32 x i8], align 64
-
-define void @cart_to_dc2y_(double* noalias nocapture %xx, double* noalias nocapture %yy, double* noalias nocapture %zz, [5 x { double, double }]* noalias nocapture %c2ten) nounwind uwtable {
-entry:
-  %0 = fmul double undef, undef
-  %1 = fmul double undef, undef
-  %2 = fadd double undef, undef
-  %3 = fmul double undef, 0x3FE8B8B76E3E9919
-  %4 = fsub double %0, %1
-  %5 = fsub double -0.000000e+00, undef
-  %6 = fmul double undef, undef
-  %7 = fmul double %4, %6
-  %8 = fmul double undef, 2.000000e+00
-  %9 = fmul double %8, undef
-  %10 = fmul double undef, %9
-  %11 = fmul double %10, undef
-  %12 = fsub double undef, %7
-  %13 = fmul double %3, %12
-  %14 = fmul double %3, undef
-  %15 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 0
-  store double %13, double* %15, align 8
-  %16 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 0, i32 1
-  %17 = fmul double undef, %8
-  %18 = fmul double %17, undef
-  %19 = fmul double undef, %18
-  %20 = fadd double undef, undef
-  %21 = fmul double %3, %19
-  %22 = fsub double -0.000000e+00, %21
-  %23 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 0
-  store double %22, double* %23, align 8
-  %24 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 1, i32 1
-  %25 = fmul double undef, 0x3FE42F601A8C6794
-  %26 = fmul double undef, 2.000000e+00
-  %27 = fsub double %26, %0
-  %28 = fmul double %6, undef
-  %29 = fsub double undef, %28
-  %30 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 0
-  store double undef, double* %30, align 8
-  %31 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 2, i32 1
-  %32 = fmul double undef, %17
-  %33 = fmul double undef, %17
-  %34 = fmul double undef, %32
-  %35 = fmul double undef, %33
-  %36 = fsub double undef, %35
-  %37 = fmul double %3, %34
-  %38 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 3, i32 0
-  store double %37, double* %38, align 8
-  %39 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 3, i32 1
-  %40 = fmul double undef, %8
-  %41 = fmul double undef, %40
-  %42 = fmul double undef, %41
-  %43 = fsub double undef, %42
-  %44 = fmul double %3, %43
-  %45 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 0
-  store double %13, double* %45, align 8
-  %46 = getelementptr inbounds [5 x { double, double }], [5 x { double, double }]* %c2ten, i64 0, i64 4, i32 1
-  %47 = fsub double -0.000000e+00, %14
-  store double %47, double* %16, align 8
-  store double undef, double* %24, align 8
-  store double -0.000000e+00, double* %31, align 8
-  store double undef, double* %39, align 8
-  store double undef, double* %46, align 8
-  ret void
-}
-
-attributes #0 = { nounwind uwtable }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind }
diff --git a/test/Transforms/BBVectorize/X86/sh-rec.ll b/test/Transforms/BBVectorize/X86/sh-rec.ll
deleted file mode 100644
index 2cb9dbded22..00000000000
--- a/test/Transforms/BBVectorize/X86/sh-rec.ll
+++ /dev/null
@@ -1,54 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
-
-define void @ptoa() nounwind uwtable {
-entry:
-  %call = call i8* @malloc() nounwind
-  br i1 undef, label %return, label %if.end10
-
-if.end10:                                         ; preds = %entry
-  %incdec.ptr = getelementptr inbounds i8, i8* %call, i64 undef
-  %call17 = call i32 @ptou() nounwind
-  %incdec.ptr26.1 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -2
-  store i8 undef, i8* %incdec.ptr26.1, align 1
-  %div27.1 = udiv i32 %call17, 100
-  %rem.2 = urem i32 %div27.1, 10
-  %add2230.2 = or i32 %rem.2, 48
-  %conv25.2 = trunc i32 %add2230.2 to i8
-  %incdec.ptr26.2 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -3
-  store i8 %conv25.2, i8* %incdec.ptr26.2, align 1
-  %incdec.ptr26.3 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -4
-  store i8 undef, i8* %incdec.ptr26.3, align 1
-  %div27.3 = udiv i32 %call17, 10000
-  %rem.4 = urem i32 %div27.3, 10
-  %add2230.4 = or i32 %rem.4, 48
-  %conv25.4 = trunc i32 %add2230.4 to i8
-  %incdec.ptr26.4 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -5
-  store i8 %conv25.4, i8* %incdec.ptr26.4, align 1
-  %div27.4 = udiv i32 %call17, 100000
-  %rem.5 = urem i32 %div27.4, 10
-  %add2230.5 = or i32 %rem.5, 48
-  %conv25.5 = trunc i32 %add2230.5 to i8
-  %incdec.ptr26.5 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -6
-  store i8 %conv25.5, i8* %incdec.ptr26.5, align 1
-  %incdec.ptr26.6 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -7
-  store i8 0, i8* %incdec.ptr26.6, align 1
-  %incdec.ptr26.7 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -8
-  store i8 undef, i8* %incdec.ptr26.7, align 1
-  %div27.7 = udiv i32 %call17, 100000000
-  %rem.8 = urem i32 %div27.7, 10
-  %add2230.8 = or i32 %rem.8, 48
-  %conv25.8 = trunc i32 %add2230.8 to i8
-  %incdec.ptr26.8 = getelementptr inbounds i8, i8* %incdec.ptr, i64 -9
-  store i8 %conv25.8, i8* %incdec.ptr26.8, align 1
-  unreachable
-
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: @ptoa(
-}
-
-declare noalias i8* @malloc() nounwind
-
-declare i32 @ptou()
diff --git a/test/Transforms/BBVectorize/X86/sh-rec2.ll b/test/Transforms/BBVectorize/X86/sh-rec2.ll
deleted file mode 100644
index d7a004c2138..00000000000
--- a/test/Transforms/BBVectorize/X86/sh-rec2.ll
+++ /dev/null
@@ -1,85 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
-
-%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
-
-define void @gsm_encode(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i8* %c) nounwind uwtable {
-entry:
-  %xmc = alloca [52 x i16], align 16
-  %arraydecay5 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 0
-  call void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i16* undef, i16* null, i16* undef, i16* undef, i16* undef, i16* %arraydecay5) nounwind
-  %incdec.ptr136 = getelementptr inbounds i8, i8* %c, i64 10
-  %incdec.ptr157 = getelementptr inbounds i8, i8* %c, i64 11
-  store i8 0, i8* %incdec.ptr136, align 1
-  %arrayidx162 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 11
-  %0 = load i16, i16* %arrayidx162, align 2
-  %conv1631 = trunc i16 %0 to i8
-  %and164 = shl i8 %conv1631, 3
-  %shl165 = and i8 %and164, 56
-  %incdec.ptr172 = getelementptr inbounds i8, i8* %c, i64 12
-  store i8 %shl165, i8* %incdec.ptr157, align 1
-  %1 = load i16, i16* inttoptr (i64 2 to i16*), align 2
-  %conv1742 = trunc i16 %1 to i8
-  %and175 = shl i8 %conv1742, 1
-  %incdec.ptr183 = getelementptr inbounds i8, i8* %c, i64 13
-  store i8 %and175, i8* %incdec.ptr172, align 1
-  %incdec.ptr199 = getelementptr inbounds i8, i8* %c, i64 14
-  store i8 0, i8* %incdec.ptr183, align 1
-  %arrayidx214 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 15
-  %incdec.ptr220 = getelementptr inbounds i8, i8* %c, i64 15
-  store i8 0, i8* %incdec.ptr199, align 1
-  %2 = load i16, i16* %arrayidx214, align 2
-  %conv2223 = trunc i16 %2 to i8
-  %and223 = shl i8 %conv2223, 6
-  %incdec.ptr235 = getelementptr inbounds i8, i8* %c, i64 16
-  store i8 %and223, i8* %incdec.ptr220, align 1
-  %arrayidx240 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 19
-  %3 = load i16, i16* %arrayidx240, align 2
-  %conv2414 = trunc i16 %3 to i8
-  %and242 = shl i8 %conv2414, 2
-  %shl243 = and i8 %and242, 28
-  %incdec.ptr251 = getelementptr inbounds i8, i8* %c, i64 17
-  store i8 %shl243, i8* %incdec.ptr235, align 1
-  %incdec.ptr272 = getelementptr inbounds i8, i8* %c, i64 18
-  store i8 0, i8* %incdec.ptr251, align 1
-  %arrayidx282 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 25
-  %4 = load i16, i16* %arrayidx282, align 2
-  %conv2835 = trunc i16 %4 to i8
-  %and284 = and i8 %conv2835, 7
-  %incdec.ptr287 = getelementptr inbounds i8, i8* %c, i64 19
-  store i8 %and284, i8* %incdec.ptr272, align 1
-  %incdec.ptr298 = getelementptr inbounds i8, i8* %c, i64 20
-  store i8 0, i8* %incdec.ptr287, align 1
-  %incdec.ptr314 = getelementptr inbounds i8, i8* %c, i64 21
-  store i8 0, i8* %incdec.ptr298, align 1
-  %arrayidx319 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 26
-  %5 = load i16, i16* %arrayidx319, align 4
-  %conv3206 = trunc i16 %5 to i8
-  %and321 = shl i8 %conv3206, 4
-  %shl322 = and i8 %and321, 112
-  %incdec.ptr335 = getelementptr inbounds i8, i8* %c, i64 22
-  store i8 %shl322, i8* %incdec.ptr314, align 1
-  %arrayidx340 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 29
-  %6 = load i16, i16* %arrayidx340, align 2
-  %conv3417 = trunc i16 %6 to i8
-  %and342 = shl i8 %conv3417, 3
-  %shl343 = and i8 %and342, 56
-  %incdec.ptr350 = getelementptr inbounds i8, i8* %c, i64 23
-  store i8 %shl343, i8* %incdec.ptr335, align 1
-  %incdec.ptr366 = getelementptr inbounds i8, i8* %c, i64 24
-  store i8 0, i8* %incdec.ptr350, align 1
-  %arrayidx381 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 36
-  %incdec.ptr387 = getelementptr inbounds i8, i8* %c, i64 25
-  store i8 0, i8* %incdec.ptr366, align 1
-  %7 = load i16, i16* %arrayidx381, align 8
-  %conv3898 = trunc i16 %7 to i8
-  %and390 = shl i8 %conv3898, 6
-  store i8 %and390, i8* %incdec.ptr387, align 1
-  unreachable
-; CHECK-LABEL: @gsm_encode(
-}
-
-declare void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
-
-declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-rec3.ll b/test/Transforms/BBVectorize/X86/sh-rec3.ll
deleted file mode 100644
index 2096deb08a9..00000000000
--- a/test/Transforms/BBVectorize/X86/sh-rec3.ll
+++ /dev/null
@@ -1,170 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
-
-%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
-
-define void @gsm_encode(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i8* %c) nounwind uwtable {
-entry:
-  %LARc28 = alloca [2 x i64], align 16
-  %LARc28.sub = getelementptr inbounds [2 x i64], [2 x i64]* %LARc28, i64 0, i64 0
-  %tmpcast = bitcast [2 x i64]* %LARc28 to [8 x i16]*
-  %Nc = alloca [4 x i16], align 2
-  %Mc = alloca [4 x i16], align 2
-  %bc = alloca [4 x i16], align 2
-  %xmc = alloca [52 x i16], align 16
-  %arraydecay = bitcast [2 x i64]* %LARc28 to i16*
-  %arraydecay1 = getelementptr inbounds [4 x i16], [4 x i16]* %Nc, i64 0, i64 0
-  %arraydecay2 = getelementptr inbounds [4 x i16], [4 x i16]* %bc, i64 0, i64 0
-  %arraydecay3 = getelementptr inbounds [4 x i16], [4 x i16]* %Mc, i64 0, i64 0
-  %arraydecay5 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 0
-  call void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i16* %arraydecay, i16* %arraydecay1, i16* %arraydecay2, i16* %arraydecay3, i16* undef, i16* %arraydecay5) nounwind
-  %0 = load i64, i64* %LARc28.sub, align 16
-  %1 = trunc i64 %0 to i32
-  %conv1 = lshr i32 %1, 2
-  %and = and i32 %conv1, 15
-  %or = or i32 %and, 208
-  %conv6 = trunc i32 %or to i8
-  %incdec.ptr = getelementptr inbounds i8, i8* %c, i64 1
-  store i8 %conv6, i8* %c, align 1
-  %conv84 = trunc i64 %0 to i8
-  %and9 = shl i8 %conv84, 6
-  %incdec.ptr15 = getelementptr inbounds i8, i8* %c, i64 2
-  store i8 %and9, i8* %incdec.ptr, align 1
-  %2 = lshr i64 %0, 50
-  %shr226.tr = trunc i64 %2 to i8
-  %conv25 = and i8 %shr226.tr, 7
-  %incdec.ptr26 = getelementptr inbounds i8, i8* %c, i64 3
-  store i8 %conv25, i8* %incdec.ptr15, align 1
-  %incdec.ptr42 = getelementptr inbounds i8, i8* %c, i64 4
-  store i8 0, i8* %incdec.ptr26, align 1
-  %arrayidx52 = getelementptr inbounds [8 x i16], [8 x i16]* %tmpcast, i64 0, i64 7
-  %3 = load i16, i16* %arrayidx52, align 2
-  %conv537 = trunc i16 %3 to i8
-  %and54 = and i8 %conv537, 7
-  %incdec.ptr57 = getelementptr inbounds i8, i8* %c, i64 5
-  store i8 %and54, i8* %incdec.ptr42, align 1
-  %incdec.ptr68 = getelementptr inbounds i8, i8* %c, i64 6
-  store i8 0, i8* %incdec.ptr57, align 1
-  %4 = load i16, i16* %arraydecay3, align 2
-  %conv748 = trunc i16 %4 to i8
-  %and75 = shl i8 %conv748, 5
-  %shl76 = and i8 %and75, 96
-  %incdec.ptr84 = getelementptr inbounds i8, i8* %c, i64 7
-  store i8 %shl76, i8* %incdec.ptr68, align 1
-  %arrayidx94 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 1
-  %5 = load i16, i16* %arrayidx94, align 2
-  %conv959 = trunc i16 %5 to i8
-  %and96 = shl i8 %conv959, 1
-  %shl97 = and i8 %and96, 14
-  %or103 = or i8 %shl97, 1
-  %incdec.ptr105 = getelementptr inbounds i8, i8* %c, i64 8
-  store i8 %or103, i8* %incdec.ptr84, align 1
-  %arrayidx115 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 4
-  %6 = bitcast i16* %arrayidx115 to i32*
-  %7 = load i32, i32* %6, align 8
-  %conv11610 = trunc i32 %7 to i8
-  %and117 = and i8 %conv11610, 7
-  %incdec.ptr120 = getelementptr inbounds i8, i8* %c, i64 9
-  store i8 %and117, i8* %incdec.ptr105, align 1
-  %8 = lshr i32 %7, 16
-  %and12330 = shl nuw nsw i32 %8, 5
-  %and123 = trunc i32 %and12330 to i8
-  %incdec.ptr136 = getelementptr inbounds i8, i8* %c, i64 10
-  store i8 %and123, i8* %incdec.ptr120, align 1
-  %incdec.ptr157 = getelementptr inbounds i8, i8* %c, i64 11
-  store i8 0, i8* %incdec.ptr136, align 1
-  %incdec.ptr172 = getelementptr inbounds i8, i8* %c, i64 12
-  store i8 0, i8* %incdec.ptr157, align 1
-  %arrayidx173 = getelementptr inbounds [4 x i16], [4 x i16]* %Nc, i64 0, i64 1
-  %9 = load i16, i16* %arrayidx173, align 2
-  %conv17412 = zext i16 %9 to i32
-  %and175 = shl nuw nsw i32 %conv17412, 1
-  %arrayidx177 = getelementptr inbounds [4 x i16], [4 x i16]* %bc, i64 0, i64 1
-  %10 = load i16, i16* %arrayidx177, align 2
-  %conv17826 = zext i16 %10 to i32
-  %shr17913 = lshr i32 %conv17826, 1
-  %and180 = and i32 %shr17913, 1
-  %or181 = or i32 %and175, %and180
-  %conv182 = trunc i32 %or181 to i8
-  %incdec.ptr183 = getelementptr inbounds i8, i8* %c, i64 13
-  store i8 %conv182, i8* %incdec.ptr172, align 1
-  %arrayidx188 = getelementptr inbounds [4 x i16], [4 x i16]* %Mc, i64 0, i64 1
-  %11 = load i16, i16* %arrayidx188, align 2
-  %conv18914 = trunc i16 %11 to i8
-  %and190 = shl i8 %conv18914, 5
-  %shl191 = and i8 %and190, 96
-  %incdec.ptr199 = getelementptr inbounds i8, i8* %c, i64 14
-  store i8 %shl191, i8* %incdec.ptr183, align 1
-  %arrayidx209 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 14
-  %12 = load i16, i16* %arrayidx209, align 4
-  %conv21015 = trunc i16 %12 to i8
-  %and211 = shl i8 %conv21015, 1
-  %shl212 = and i8 %and211, 14
-  %or218 = or i8 %shl212, 1
-  %incdec.ptr220 = getelementptr inbounds i8, i8* %c, i64 15
-  store i8 %or218, i8* %incdec.ptr199, align 1
-  %arrayidx225 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 16
-  %13 = bitcast i16* %arrayidx225 to i64*
-  %14 = load i64, i64* %13, align 16
-  %conv22616 = trunc i64 %14 to i8
-  %and227 = shl i8 %conv22616, 3
-  %shl228 = and i8 %and227, 56
-  %incdec.ptr235 = getelementptr inbounds i8, i8* %c, i64 16
-  store i8 %shl228, i8* %incdec.ptr220, align 1
-  %15 = lshr i64 %14, 32
-  %and23832 = shl nuw nsw i64 %15, 5
-  %and238 = trunc i64 %and23832 to i8
-  %incdec.ptr251 = getelementptr inbounds i8, i8* %c, i64 17
-  store i8 %and238, i8* %incdec.ptr235, align 1
-  %arrayidx266 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 23
-  %incdec.ptr272 = getelementptr inbounds i8, i8* %c, i64 18
-  store i8 0, i8* %incdec.ptr251, align 1
-  %16 = load i16, i16* %arrayidx266, align 2
-  %conv27418 = trunc i16 %16 to i8
-  %and275 = shl i8 %conv27418, 6
-  %incdec.ptr287 = getelementptr inbounds i8, i8* %c, i64 19
-  store i8 %and275, i8* %incdec.ptr272, align 1
-  %arrayidx288 = getelementptr inbounds [4 x i16], [4 x i16]* %Nc, i64 0, i64 2
-  %17 = load i16, i16* %arrayidx288, align 2
-  %conv28919 = zext i16 %17 to i32
-  %and290 = shl nuw nsw i32 %conv28919, 1
-  %arrayidx292 = getelementptr inbounds [4 x i16], [4 x i16]* %bc, i64 0, i64 2
-  %18 = load i16, i16* %arrayidx292, align 2
-  %conv29327 = zext i16 %18 to i32
-  %shr29420 = lshr i32 %conv29327, 1
-  %and295 = and i32 %shr29420, 1
-  %or296 = or i32 %and290, %and295
-  %conv297 = trunc i32 %or296 to i8
-  %incdec.ptr298 = getelementptr inbounds i8, i8* %c, i64 20
-  store i8 %conv297, i8* %incdec.ptr287, align 1
-  %conv30021 = trunc i16 %18 to i8
-  %and301 = shl i8 %conv30021, 7
-  %incdec.ptr314 = getelementptr inbounds i8, i8* %c, i64 21
-  store i8 %and301, i8* %incdec.ptr298, align 1
-  %incdec.ptr335 = getelementptr inbounds i8, i8* %c, i64 22
-  store i8 0, i8* %incdec.ptr314, align 1
-  %arrayidx340 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 29
-  %19 = load i16, i16* %arrayidx340, align 2
-  %conv34122 = trunc i16 %19 to i8
-  %and342 = shl i8 %conv34122, 3
-  %shl343 = and i8 %and342, 56
-  %incdec.ptr350 = getelementptr inbounds i8, i8* %c, i64 23
-  store i8 %shl343, i8* %incdec.ptr335, align 1
-  %arrayidx355 = getelementptr inbounds [52 x i16], [52 x i16]* %xmc, i64 0, i64 32
-  %20 = bitcast i16* %arrayidx355 to i32*
-  %21 = load i32, i32* %20, align 16
-  %conv35623 = shl i32 %21, 2
-  %shl358 = and i32 %conv35623, 28
-  %22 = lshr i32 %21, 17
-  %and363 = and i32 %22, 3
-  %or364 = or i32 %shl358, %and363
-  %conv365 = trunc i32 %or364 to i8
-  store i8 %conv365, i8* %incdec.ptr350, align 1
-  unreachable
-; CHECK-LABEL: @gsm_encode(
-}
-
-declare void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
-
-declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-types.ll b/test/Transforms/BBVectorize/X86/sh-types.ll
deleted file mode 100644
index fbff2fb86eb..00000000000
--- a/test/Transforms/BBVectorize/X86/sh-types.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
-
-define <4 x float> @test7(<4 x float> %A1, <4 x float> %B1, double %C1, double %C2, double %D1, double %D2) {
-        %A2 = shufflevector <4 x float> %A1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
-        %B2 = shufflevector <4 x float> %B1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
-        %X1 = shufflevector <4 x float> %A2, <4 x float> undef, <2 x i32> <i32 0, i32 1>
-        %X2 = shufflevector <4 x float> %B2, <4 x float> undef, <2 x i32> <i32 2, i32 3>
-        %Y1 = shufflevector <2 x float> %X1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-        %Y2 = shufflevector <2 x float> %X2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-
-	%M1 = fsub double %C1, %D1
-	%M2 = fsub double %C2, %D2
-	%N1 = fmul double %M1, %C1
-	%N2 = fmul double %M2, %C2
-	%Z1 = fadd double %N1, %D1
-	%Z2 = fadd double %N2, %D2
-
-        %R = fmul <4 x float> %Y1, %Y2
-        ret <4 x float> %R
-; CHECK-LABEL: @test7(
-; CHECK-NOT: <8 x float>
-; CHECK: ret <4 x float>
-}
-
diff --git a/test/Transforms/BBVectorize/X86/simple-int.ll b/test/Transforms/BBVectorize/X86/simple-int.ll
deleted file mode 100644
index ee5b5b3e4d0..00000000000
--- a/test/Transforms/BBVectorize/X86/simple-int.ll
+++ /dev/null
@@ -1,127 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-
-declare double @llvm.fma.f64(double, double, double)
-declare double @llvm.fmuladd.f64(double, double, double)
-declare double @llvm.cos.f64(double)
-declare double @llvm.powi.f64(double, i32)
-
-; Basic depth-3 chain with fma
-define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[Y1:%.*]] = call double @llvm.fma.f64(double [[X1]], double [[A1]], double [[C1:%.*]])
-; CHECK-NEXT:    [[Y2:%.*]] = call double @llvm.fma.f64(double [[X2]], double [[A2]], double [[C2:%.*]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd double [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = fadd double [[Y2]], [[B2]]
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1]], [[Z2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
-  %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with fmuladd
-define double @test1a(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
-; CHECK-LABEL: @test1a(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1_V_I2_1:%.*]] = insertelement <2 x double> undef, double [[C1:%.*]], i32 0
-; CHECK-NEXT:    [[Y1_V_I2_2:%.*]] = insertelement <2 x double> [[Y1_V_I2_1]], double [[C2:%.*]], i32 1
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[X1]], <2 x double> [[X1_V_I0_2]], <2 x double> [[Y1_V_I2_2]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.fmuladd.f64(double %X1, double %A1, double %C1)
-  %Y2 = call double @llvm.fmuladd.f64(double %X2, double %A2, double %C2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with cos
-define double @test2(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[Y1:%.*]] = call double @llvm.cos.f64(double [[X1]])
-; CHECK-NEXT:    [[Y2:%.*]] = call double @llvm.cos.f64(double [[X2]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd double [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = fadd double [[Y2]], [[B2]]
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1]], [[Z2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.cos.f64(double %X1)
-  %Y2 = call double @llvm.cos.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with powi
-define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[Y1:%.*]] = call double @llvm.powi.f64(double [[X1]], i32 [[P:%.*]])
-; CHECK-NEXT:    [[Y2:%.*]] = call double @llvm.powi.f64(double [[X2]], i32 [[P]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd double [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = fadd double [[Y2]], [[B2]]
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1]], [[Z2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
-  %Y2 = call double @llvm.powi.f64(double %X2, i32 %P)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with powi (different powers: should not vectorize)
-define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = add i32 [[P:%.*]], 1
-; CHECK-NEXT:    [[Y1:%.*]] = call double @llvm.powi.f64(double [[X1]], i32 [[P]])
-; CHECK-NEXT:    [[Y2:%.*]] = call double @llvm.powi.f64(double [[X2]], i32 [[P2]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd double [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = fadd double [[Y2]], [[B2]]
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1]], [[Z2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %P2 = add i32 %P, 1
-  %Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
-  %Y2 = call double @llvm.powi.f64(double %X2, i32 %P2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
diff --git a/test/Transforms/BBVectorize/X86/simple-ldstr.ll b/test/Transforms/BBVectorize/X86/simple-ldstr.ll
deleted file mode 100644
index a81d9638f5e..00000000000
--- a/test/Transforms/BBVectorize/X86/simple-ldstr.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-
-; Simple 3-pair chain with loads and stores
-define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[I0_V_I0:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
-; CHECK-NEXT:    [[I1_V_I0:%.*]] = bitcast double* [[B:%.*]] to <2 x double>*
-; CHECK-NEXT:    [[I0:%.*]] = load <2 x double>, <2 x double>* [[I0_V_I0]], align 8
-; CHECK-NEXT:    [[I1:%.*]] = load <2 x double>, <2 x double>* [[I1_V_I0]], align 8
-; CHECK-NEXT:    [[MUL:%.*]] = fmul <2 x double> [[I0]], [[I1]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[C:%.*]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[MUL]], <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    ret void
-;
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  store double %mul, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %mul5, double* %arrayidx5, align 8
-  ret void
-}
-
diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll
deleted file mode 100644
index 0f7ddffbd19..00000000000
--- a/test/Transforms/BBVectorize/X86/simple.ll
+++ /dev/null
@@ -1,149 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-
-; Basic depth-3 chain
-define double @test1(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic chain
-define double @test1a(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test1a(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[W1:%.*]] = fadd <2 x double> [[Y1]], [[Z1]]
-; CHECK-NEXT:    [[V1:%.*]] = fadd <2 x double> [[W1]], [[Z1]]
-; CHECK-NEXT:    [[Q1:%.*]] = fadd <2 x double> [[W1]], [[V1]]
-; CHECK-NEXT:    [[S1:%.*]] = fadd <2 x double> [[W1]], [[Q1]]
-; CHECK-NEXT:    [[S1_V_R1:%.*]] = extractelement <2 x double> [[S1]], i32 0
-; CHECK-NEXT:    [[S1_V_R2:%.*]] = extractelement <2 x double> [[S1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[S1_V_R1]], [[S1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %W1 = fadd double %Y1, %Z1
-  %W2 = fadd double %Y2, %Z2
-  %V1 = fadd double %W1, %Z1
-  %V2 = fadd double %W2, %Z2
-  %Q1 = fadd double %W1, %V1
-  %Q2 = fadd double %W2, %V2
-  %S1 = fadd double %W1, %Q1
-  %S2 = fadd double %W2, %Q2
-  %R  = fmul double %S1, %S2
-  ret double %R
-}
-
-; Basic depth-3 chain (last pair permuted)
-define double @test2(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0
-; CHECK-NEXT:    [[Z1_V_I1_2:%.*]] = insertelement <2 x double> [[Z1_V_I1_1]], double [[B1]], i32 1
-; CHECK-NEXT:    [[Z2:%.*]] = fadd <2 x double> [[Y1]], [[Z1_V_I1_2]]
-; CHECK-NEXT:    [[Z2_V_R1:%.*]] = extractelement <2 x double> [[Z2]], i32 0
-; CHECK-NEXT:    [[Z2_V_R2:%.*]] = extractelement <2 x double> [[Z2]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z2_V_R2]], [[Z2_V_R1]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y2, %B1
-  %Z2 = fadd double %Y1, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-4 chain (internal permutation)
-define double @test4(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0
-; CHECK-NEXT:    [[Z1_V_I1_2:%.*]] = insertelement <2 x double> [[Z1_V_I1_1]], double [[B1]], i32 1
-; CHECK-NEXT:    [[Z2:%.*]] = fadd <2 x double> [[Y1]], [[Z1_V_I1_2]]
-; CHECK-NEXT:    [[Z2_V_R1:%.*]] = extractelement <2 x double> [[Z2]], i32 0
-; CHECK-NEXT:    [[Z2_V_R2:%.*]] = extractelement <2 x double> [[Z2]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z2_V_R2]], [[Z2_V_R1]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y2, %B1
-  %Z2 = fadd double %Y1, %B2
-  %W1 = fadd double %Y2, %Z1
-  %W2 = fadd double %Y1, %Z2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic chain with shuffles
-define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT:    [[X1:%.*]] = sub <8 x i8> [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = sub <8 x i8> [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[Y1:%.*]] = mul <8 x i8> [[X1]], [[A1]]
-; CHECK-NEXT:    [[Y2:%.*]] = mul <8 x i8> [[X2]], [[A2]]
-; CHECK-NEXT:    [[Z1:%.*]] = add <8 x i8> [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = add <8 x i8> [[Y2]], [[B2]]
-; CHECK-NEXT:    [[Q1:%.*]] = shufflevector <8 x i8> [[Z1]], <8 x i8> [[Z2]], <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
-; CHECK-NEXT:    [[Q2:%.*]] = shufflevector <8 x i8> [[Z2]], <8 x i8> undef, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
-; CHECK-NEXT:    [[R:%.*]] = mul <8 x i8> [[Q1]], [[Q2]]
-; CHECK-NEXT:    ret <8 x i8> [[R]]
-;
-  %X1 = sub <8 x i8> %A1, %B1
-  %X2 = sub <8 x i8> %A2, %B2
-  %Y1 = mul <8 x i8> %X1, %A1
-  %Y2 = mul <8 x i8> %X2, %A2
-  %Z1 = add <8 x i8> %Y1, %B1
-  %Z2 = add <8 x i8> %Y2, %B2
-  %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
-  %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
-  %R  = mul <8 x i8> %Q1, %Q2
-  ret <8 x i8> %R
-}
-
diff --git a/test/Transforms/BBVectorize/X86/vs-cast.ll b/test/Transforms/BBVectorize/X86/vs-cast.ll
deleted file mode 100644
index 297f2d5a7b3..00000000000
--- a/test/Transforms/BBVectorize/X86/vs-cast.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-define void @main() nounwind uwtable {
-; CHECK-LABEL: @main(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> undef to i128
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i64> undef to i128
-; CHECK-NEXT:    ret void
-;
-entry:
-  %0 = bitcast <2 x i64> undef to i128
-  %1 = bitcast <2 x i64> undef to i128
-  ret void
-}
-
diff --git a/test/Transforms/BBVectorize/X86/wr-aliases.ll b/test/Transforms/BBVectorize/X86/wr-aliases.ll
deleted file mode 100644
index e34414988f3..00000000000
--- a/test/Transforms/BBVectorize/X86/wr-aliases.ll
+++ /dev/null
@@ -1,144 +0,0 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -disable-basicaa -bb-vectorize -S < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-%class.QBezier.15 = type { double, double, double, double, double, double, double, double }
-
-; Function Attrs: nounwind
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
-
-; Function Attrs: uwtable
-declare fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval nocapture readonly align 8) #1
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
-
-; Function Attrs: nounwind
-declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
-
-define void @main_arrayctor.cont([10 x %class.QBezier.15]* %beziers, %class.QBezier.15* %agg.tmp.i, %class.QBezier.15* %agg.tmp55.i, %class.QBezier.15* %agg.tmp56.i) {
-newFuncRoot:
-  br label %arrayctor.cont
-
-arrayctor.cont.ret.exitStub:                      ; preds = %arrayctor.cont
-  ret void
-
-; CHECK-LABEL: @main_arrayctor.cont
-; CHECK: <2 x double>
-; CHECK: @_ZL12printQBezier7QBezier
-; CHECK: store double %mul8.i, double* %x3.i, align 16
-; CHECK: load double, double* %x3.i, align 16
-; CHECK: ret
-
-arrayctor.cont:                                   ; preds = %newFuncRoot
-  %ref.tmp.sroa.0.0.idx = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 0
-  store double 1.000000e+01, double* %ref.tmp.sroa.0.0.idx, align 16
-  %ref.tmp.sroa.2.0.idx1 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 1
-  store double 2.000000e+01, double* %ref.tmp.sroa.2.0.idx1, align 8
-  %ref.tmp.sroa.3.0.idx2 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 2
-  store double 3.000000e+01, double* %ref.tmp.sroa.3.0.idx2, align 16
-  %ref.tmp.sroa.4.0.idx3 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 3
-  store double 4.000000e+01, double* %ref.tmp.sroa.4.0.idx3, align 8
-  %ref.tmp.sroa.5.0.idx4 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 4
-  store double 5.000000e+01, double* %ref.tmp.sroa.5.0.idx4, align 16
-  %ref.tmp.sroa.6.0.idx5 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 5
-  store double 6.000000e+01, double* %ref.tmp.sroa.6.0.idx5, align 8
-  %ref.tmp.sroa.7.0.idx6 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 6
-  store double 7.000000e+01, double* %ref.tmp.sroa.7.0.idx6, align 16
-  %ref.tmp.sroa.8.0.idx7 = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 7
-  store double 8.000000e+01, double* %ref.tmp.sroa.8.0.idx7, align 8
-  %add.ptr = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1
-  %v0 = bitcast %class.QBezier.15* %agg.tmp.i to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %v0)
-  %v1 = bitcast %class.QBezier.15* %agg.tmp55.i to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %v1)
-  %v2 = bitcast %class.QBezier.15* %agg.tmp56.i to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %v2)
-  %v3 = bitcast [10 x %class.QBezier.15]* %beziers to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v0, i8* %v3, i64 64, i32 8, i1 false)
-  call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp.i)
-  %x2.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 2
-  %v4 = load double, double* %x2.i, align 16
-  %x3.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 4
-  %v5 = load double, double* %x3.i, align 16
-  %add.i = fadd double %v4, %v5
-  %mul.i = fmul double 5.000000e-01, %add.i
-  %x1.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 0
-  %v6 = load double, double* %x1.i, align 16
-  %add3.i = fadd double %v4, %v6
-  %mul4.i = fmul double 5.000000e-01, %add3.i
-  %x25.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 2
-  store double %mul4.i, double* %x25.i, align 16
-  %v7 = load double, double* %x3.i, align 16
-  %x4.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 6
-  %v8 = load double, double* %x4.i, align 16
-  %add7.i = fadd double %v7, %v8
-  %mul8.i = fmul double 5.000000e-01, %add7.i
-  store double %mul8.i, double* %x3.i, align 16
-  %v9 = load double, double* %x1.i, align 16
-  %x111.i = getelementptr inbounds %class.QBezier.15, %class.QBezier.15* %add.ptr, i64 0, i32 0
-  store double %v9, double* %x111.i, align 16
-  %v10 = load double, double* %x25.i, align 16
-  %add15.i = fadd double %mul.i, %v10
-  %mul16.i = fmul double 5.000000e-01, %add15.i
-  %x317.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 4
-  store double %mul16.i, double* %x317.i, align 16
-  %v11 = load double, double* %x3.i, align 16
-  %add19.i = fadd double %mul.i, %v11
-  %mul20.i = fmul double 5.000000e-01, %add19.i
-  store double %mul20.i, double* %x2.i, align 16
-  %v12 = load double, double* %x317.i, align 16
-  %add24.i = fadd double %v12, %mul20.i
-  %mul25.i = fmul double 5.000000e-01, %add24.i
-  store double %mul25.i, double* %x1.i, align 16
-  %x427.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 6
-  store double %mul25.i, double* %x427.i, align 16
-  %y2.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 3
-  %v13 = load double, double* %y2.i, align 8
-  %y3.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 5
-  %v14 = load double, double* %y3.i, align 8
-  %add28.i = fadd double %v13, %v14
-  %div.i = fmul double 5.000000e-01, %add28.i
-  %y1.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 1
-  %v15 = load double, double* %y1.i, align 8
-  %add30.i = fadd double %v13, %v15
-  %mul31.i = fmul double 5.000000e-01, %add30.i
-  %y232.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 3
-  store double %mul31.i, double* %y232.i, align 8
-  %v16 = load double, double* %y3.i, align 8
-  %y4.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 0, i32 7
-  %v17 = load double, double* %y4.i, align 8
-  %add34.i = fadd double %v16, %v17
-  %mul35.i = fmul double 5.000000e-01, %add34.i
-  store double %mul35.i, double* %y3.i, align 8
-  %v18 = load double, double* %y1.i, align 8
-  %y138.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 1
-  store double %v18, double* %y138.i, align 8
-  %v19 = load double, double* %y232.i, align 8
-  %add42.i = fadd double %div.i, %v19
-  %mul43.i = fmul double 5.000000e-01, %add42.i
-  %y344.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 5
-  store double %mul43.i, double* %y344.i, align 8
-  %v20 = load double, double* %y3.i, align 8
-  %add46.i = fadd double %div.i, %v20
-  %mul47.i = fmul double 5.000000e-01, %add46.i
-  store double %mul47.i, double* %y2.i, align 8
-  %v21 = load double, double* %y344.i, align 8
-  %add51.i = fadd double %v21, %mul47.i
-  %mul52.i = fmul double 5.000000e-01, %add51.i
-  store double %mul52.i, double* %y1.i, align 8
-  %y454.i = getelementptr inbounds [10 x %class.QBezier.15], [10 x %class.QBezier.15]* %beziers, i64 0, i64 1, i32 7
-  store double %mul52.i, double* %y454.i, align 8
-  %v22 = bitcast %class.QBezier.15* %add.ptr to i8*
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v1, i8* %v22, i64 64, i32 8, i1 false)
-  call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp55.i)
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v2, i8* %v3, i64 64, i32 8, i1 false)
-  call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp56.i)
-  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v0)
-  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v1)
-  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v2)
-  br label %arrayctor.cont.ret.exitStub
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll
deleted file mode 100644
index 6bfa625ea5f..00000000000
--- a/test/Transforms/BBVectorize/cycle.ll
+++ /dev/null
@@ -1,112 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-
-; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise
-; want to select the pairs:
-; %div77 = fdiv double %sub74, %mul76.v.r1 <->   %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117)
-; %add84 = fadd double %sub83, 2.000000e+00 <->   %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77)
-; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <->   %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84)
-; %mul117 = fmul double %sub39.v.r1, %sub116 <->   %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95)
-; and so a dependency cycle would be created.
-
-declare double @fabs(double) nounwind readnone
-define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) {
-entry:
-  br label %go
-go:
-  %conv = sitofp i32 %n.0 to double
-  %add35 = fadd double %conv, %a
-  %sub36 = fadd double %add35, -1.000000e+00
-  %add38 = fadd double %conv, %b
-  %sub39 = fadd double %add38, -1.000000e+00
-  %add41 = fadd double %conv, %c
-  %sub42 = fadd double %add41, -1.000000e+00
-  %sub45 = fadd double %add35, -2.000000e+00
-  %sub48 = fadd double %add38, -2.000000e+00
-  %sub51 = fadd double %add41, -2.000000e+00
-  %mul52 = shl nsw i32 %n.0, 1
-  %sub53 = add nsw i32 %mul52, -1
-  %conv54 = sitofp i32 %sub53 to double
-  %sub56 = add nsw i32 %mul52, -3
-  %conv57 = sitofp i32 %sub56 to double
-  %sub59 = add nsw i32 %mul52, -5
-  %conv60 = sitofp i32 %sub59 to double
-  %mul61 = mul nsw i32 %n.0, %n.0
-  %conv62 = sitofp i32 %mul61 to double
-  %mul63 = fmul double %conv62, 3.000000e+00
-  %mul67 = fmul double %sub65, %conv
-  %add68 = fadd double %mul63, %mul67
-  %add69 = fadd double %add68, 2.000000e+00
-  %sub71 = fsub double %add69, %mul2.v.r1
-  %sub74 = fsub double %sub71, %mul73
-  %mul75 = fmul double %conv57, 2.000000e+00
-  %mul76 = fmul double %mul75, %sub42
-  %div77 = fdiv double %sub74, %mul76
-  %mul82 = fmul double %add80, %conv
-  %sub83 = fsub double %mul63, %mul82
-  %add84 = fadd double %sub83, 2.000000e+00
-  %sub86 = fsub double %add84, %mul2.v.r1
-  %sub87 = fsub double -0.000000e+00, %sub86
-  %mul88 = fmul double %sub36, %sub87
-  %mul89 = fmul double %mul88, %sub39
-  %mul90 = fmul double %conv54, 4.000000e+00
-  %mul91 = fmul double %mul90, %conv57
-  %mul92 = fmul double %mul91, %sub51
-  %mul93 = fmul double %mul92, %sub42
-  %div94 = fdiv double %mul89, %mul93
-  %mul95 = fmul double %sub45, %sub36
-  %mul96 = fmul double %mul95, %sub48
-  %mul97 = fmul double %mul96, %sub39
-  %sub99 = fsub double %conv, %a
-  %sub100 = fadd double %sub99, -2.000000e+00
-  %mul101 = fmul double %mul97, %sub100
-  %sub103 = fsub double %conv, %b
-  %sub104 = fadd double %sub103, -2.000000e+00
-  %mul105 = fmul double %mul101, %sub104
-  %mul106 = fmul double %conv57, 8.000000e+00
-  %mul107 = fmul double %mul106, %conv57
-  %mul108 = fmul double %mul107, %conv60
-  %sub111 = fadd double %add41, -3.000000e+00
-  %mul112 = fmul double %mul108, %sub111
-  %mul113 = fmul double %mul112, %sub51
-  %mul114 = fmul double %mul113, %sub42
-  %div115 = fdiv double %mul105, %mul114
-  %sub116 = fsub double -0.000000e+00, %sub36
-  %mul117 = fmul double %sub39, %sub116
-  %sub119 = fsub double %conv, %c
-  %sub120 = fadd double %sub119, -1.000000e+00
-  %mul121 = fmul double %mul117, %sub120
-  %mul123 = fmul double %mul75, %sub51
-  %mul124 = fmul double %mul123, %sub42
-  %div125 = fdiv double %mul121, %mul124
-  %mul126 = fmul double %div77, %sub
-  %add127 = fadd double %mul126, 1.000000e+00
-  %mul128 = fmul double %add127, %Anm1.0
-  %mul129 = fmul double %div94, %sub
-  %add130 = fadd double %div125, %mul129
-  %mul131 = fmul double %add130, %sub
-  %mul132 = fmul double %mul131, %Anm2.0
-  %add133 = fadd double %mul128, %mul132
-  %mul134 = fmul double %div115, %mul1
-  %mul135 = fmul double %mul134, %Anm3.0
-  %add136 = fadd double %add133, %mul135
-  %mul139 = fmul double %add127, %Bnm1.0
-  %mul143 = fmul double %mul131, %Bnm2.0
-  %add144 = fadd double %mul139, %mul143
-  %mul146 = fmul double %mul134, %Bnm3.0
-  %add147 = fadd double %add144, %mul146
-  %div148 = fdiv double %add136, %add147
-  %sub149 = fsub double %F.0, %div148
-  %div150 = fdiv double %sub149, %F.0
-  %call = tail call double @fabs(double %div150) nounwind readnone
-  %cmp = fcmp olt double %call, 0x3CB0000000000000
-  %cmp152 = icmp sgt i32 %n.0, 20000
-  %or.cond = or i1 %cmp, %cmp152
-  br i1 %or.cond, label %done, label %go
-done:
-  ret void
-; CHECK-LABEL: @test1(
-; CHECK: go:
-; CHECK: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
-; FIXME: When tree pruning is deterministic, include the entire output.
-}
diff --git a/test/Transforms/BBVectorize/func-alias.ll b/test/Transforms/BBVectorize/func-alias.ll
deleted file mode 100644
index ab72ec0e199..00000000000
--- a/test/Transforms/BBVectorize/func-alias.ll
+++ /dev/null
@@ -1,244 +0,0 @@
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -basicaa -bb-vectorize -bb-vectorize-req-chain-depth=2 -instcombine -gvn -S | FileCheck %s
-; The chain length is set to 2 so that this will do some vectorization; check that the order of the functions is unchanged.
-
-%struct.descriptor_dimension = type { i64, i64, i64 }
-%struct.__st_parameter_common = type { i32, i32, i8*, i32, i32, i8*, i32* }
-%struct.__st_parameter_dt = type { %struct.__st_parameter_common, i64, i64*, i64*, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, [256 x i8], i32*, i64, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, i32, i8*, i8*, i32, [4 x i8] }
-%"struct.array4_real(kind=4)" = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] }
-%"struct.array4_integer(kind=4).73" = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] }
-%struct.array4_unknown = type { i8*, i64, i64, [4 x %struct.descriptor_dimension] }
-
-@.cst4 = external unnamed_addr constant [11 x i8], align 8
-@.cst823 = external unnamed_addr constant [214 x i8], align 64
-@j.4580 = external global i32
-@j1.4581 = external global i32
-@nty1.4590 = external global [2 x i8]
-@nty2.4591 = external global [2 x i8]
-@xr1.4592 = external global float
-@xr2.4593 = external global float
-@yr1.4594 = external global float
-@yr2.4595 = external global float
-
-@__main1_MOD_iave = external unnamed_addr global i32
-@__main1_MOD_igrp = external global i32
-@__main1_MOD_iounit = external global i32
-@__main1_MOD_ityp = external global i32
-@__main1_MOD_mclmsg = external unnamed_addr global %struct.array4_unknown, align 32
-@__main1_MOD_mxdate = external unnamed_addr global %"struct.array4_integer(kind=4).73", align 32
-@__main1_MOD_rmxval = external unnamed_addr global %"struct.array4_real(kind=4)", align 32
-
-declare void @_gfortran_st_write(%struct.__st_parameter_dt*)
-declare void @_gfortran_st_write_done(%struct.__st_parameter_dt*)
-declare void @_gfortran_transfer_character_write(%struct.__st_parameter_dt*, i8*, i32)
-declare void @_gfortran_transfer_integer_write(%struct.__st_parameter_dt*, i8*, i32)
-declare void @_gfortran_transfer_real_write(%struct.__st_parameter_dt*, i8*, i32)
-
-define i1 @"prtmax__<bb 3>_<bb 34>"(%struct.__st_parameter_dt* %memtmp3, i32 %D.4627_188.reload) nounwind {
-; CHECK: prtmax__
-newFuncRoot:
-  br label %"<bb 34>"
-
-codeRepl80.exitStub:                              ; preds = %"<bb 34>"
-  ret i1 true
-
-"<bb 34>.<bb 25>_crit_edge.exitStub":             ; preds = %"<bb 34>"
-  ret i1 false
-
-"<bb 34>":                                        ; preds = %newFuncRoot
-  %tmp128 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
-  %tmp129 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp128, i32 0, i32 2
-  store i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.cst4, i64 0, i64 0), i8** %tmp129, align 8
-  %tmp130 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
-  %tmp131 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp130, i32 0, i32 3
-  store i32 31495, i32* %tmp131, align 4
-  %tmp132 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 5
-  store i8* getelementptr inbounds ([214 x i8], [214 x i8]* @.cst823, i64 0, i64 0), i8** %tmp132, align 8
-  %tmp133 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 6
-  store i32 214, i32* %tmp133, align 4
-  %tmp134 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
-  %tmp135 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp134, i32 0, i32 0
-  store i32 4096, i32* %tmp135, align 4
-  %iounit.8748_288 = load i32, i32* @__main1_MOD_iounit, align 4
-  %tmp136 = getelementptr inbounds %struct.__st_parameter_dt, %struct.__st_parameter_dt* %memtmp3, i32 0, i32 0
-  %tmp137 = getelementptr inbounds %struct.__st_parameter_common, %struct.__st_parameter_common* %tmp136, i32 0, i32 1
-  store i32 %iounit.8748_288, i32* %tmp137, align 4
-  call void @_gfortran_st_write(%struct.__st_parameter_dt* %memtmp3) nounwind
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* @j.4580, i32 4) nounwind
-; CHECK: @_gfortran_transfer_integer_write
-  %D.75807_289 = load i8*, i8** getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 0), align 8
-  %j.8758_290 = load i32, i32* @j.4580, align 4
-  %D.75760_291 = sext i32 %j.8758_290 to i64
-  %iave.8736_292 = load i32, i32* @__main1_MOD_iave, align 4
-  %D.75620_293 = sext i32 %iave.8736_292 to i64
-  %D.75808_294 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 2, i32 0), align 8
-  %D.75809_295 = mul nsw i64 %D.75620_293, %D.75808_294
-  %igrp.8737_296 = load i32, i32* @__main1_MOD_igrp, align 4
-  %D.75635_297 = sext i32 %igrp.8737_296 to i64
-  %D.75810_298 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 1, i32 0), align 8
-  %D.75811_299 = mul nsw i64 %D.75635_297, %D.75810_298
-  %D.75812_300 = add nsw i64 %D.75809_295, %D.75811_299
-  %D.75813_301 = add nsw i64 %D.75760_291, %D.75812_300
-  %ityp.8750_302 = load i32, i32* @__main1_MOD_ityp, align 4
-  %D.75704_303 = sext i32 %ityp.8750_302 to i64
-  %D.75814_304 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 3, i32 0), align 8
-  %D.75815_305 = mul nsw i64 %D.75704_303, %D.75814_304
-  %D.75816_306 = add nsw i64 %D.75813_301, %D.75815_305
-  %D.75817_307 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 1), align 8
-  %D.75818_308 = add nsw i64 %D.75816_306, %D.75817_307
-  %tmp138 = bitcast i8* %D.75807_289 to [0 x float]*
-  %tmp139 = bitcast [0 x float]* %tmp138 to float*
-  %D.75819_309 = getelementptr inbounds float, float* %tmp139, i64 %D.75818_308
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* %D.75819_309, i32 4) nounwind
-; CHECK: @_gfortran_transfer_real_write
-  %D.75820_310 = load i8*, i8** getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 0), align 8
-  %j.8758_311 = load i32, i32* @j.4580, align 4
-  %D.75760_312 = sext i32 %j.8758_311 to i64
-  %iave.8736_313 = load i32, i32* @__main1_MOD_iave, align 4
-  %D.75620_314 = sext i32 %iave.8736_313 to i64
-  %D.75821_315 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 2, i32 0), align 8
-  %D.75822_316 = mul nsw i64 %D.75620_314, %D.75821_315
-  %igrp.8737_317 = load i32, i32* @__main1_MOD_igrp, align 4
-  %D.75635_318 = sext i32 %igrp.8737_317 to i64
-  %D.75823_319 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 1, i32 0), align 8
-  %D.75824_320 = mul nsw i64 %D.75635_318, %D.75823_319
-  %D.75825_321 = add nsw i64 %D.75822_316, %D.75824_320
-  %D.75826_322 = add nsw i64 %D.75760_312, %D.75825_321
-  %ityp.8750_323 = load i32, i32* @__main1_MOD_ityp, align 4
-  %D.75704_324 = sext i32 %ityp.8750_323 to i64
-  %D.75827_325 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 3, i32 0), align 8
-  %D.75828_326 = mul nsw i64 %D.75704_324, %D.75827_325
-  %D.75829_327 = add nsw i64 %D.75826_322, %D.75828_326
-  %D.75830_328 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 1), align 8
-  %D.75831_329 = add nsw i64 %D.75829_327, %D.75830_328
-  %tmp140 = bitcast i8* %D.75820_310 to [0 x [1 x i8]]*
-  %tmp141 = bitcast [0 x [1 x i8]]* %tmp140 to [1 x i8]*
-  %D.75832_330 = getelementptr inbounds [1 x i8], [1 x i8]* %tmp141, i64 %D.75831_329
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [1 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [1 x i8]* %D.75832_330, i32 1) nounwind
-; CHECK: @_gfortran_transfer_character_write
-  %D.75833_331 = load i8*, i8** getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 0), align 8
-  %j.8758_332 = load i32, i32* @j.4580, align 4
-  %D.75760_333 = sext i32 %j.8758_332 to i64
-  %iave.8736_334 = load i32, i32* @__main1_MOD_iave, align 4
-  %D.75620_335 = sext i32 %iave.8736_334 to i64
-  %D.75834_336 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 2, i32 0), align 8
-  %D.75835_337 = mul nsw i64 %D.75620_335, %D.75834_336
-  %igrp.8737_338 = load i32, i32* @__main1_MOD_igrp, align 4
-  %D.75635_339 = sext i32 %igrp.8737_338 to i64
-  %D.75836_340 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 1, i32 0), align 8
-  %D.75837_341 = mul nsw i64 %D.75635_339, %D.75836_340
-  %D.75838_342 = add nsw i64 %D.75835_337, %D.75837_341
-  %D.75839_343 = add nsw i64 %D.75760_333, %D.75838_342
-  %ityp.8750_344 = load i32, i32* @__main1_MOD_ityp, align 4
-  %D.75704_345 = sext i32 %ityp.8750_344 to i64
-  %D.75840_346 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 3, i32 0), align 8
-  %D.75841_347 = mul nsw i64 %D.75704_345, %D.75840_346
-  %D.75842_348 = add nsw i64 %D.75839_343, %D.75841_347
-  %D.75843_349 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 1), align 8
-  %D.75844_350 = add nsw i64 %D.75842_348, %D.75843_349
-  %tmp142 = bitcast i8* %D.75833_331 to [0 x i32]*
-  %tmp143 = bitcast [0 x i32]* %tmp142 to i32*
-  %D.75845_351 = getelementptr inbounds i32, i32* %tmp143, i64 %D.75844_350
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* %D.75845_351, i32 4) nounwind
-; CHECK: @_gfortran_transfer_integer_write
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @xr1.4592, i32 4) nounwind
-; CHECK: @_gfortran_transfer_real_write
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @yr1.4594, i32 4) nounwind
-; CHECK: @_gfortran_transfer_real_write
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [2 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [2 x i8]* @nty1.4590, i32 2) nounwind
-; CHECK: @_gfortran_transfer_character_write
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* @j1.4581, i32 4) nounwind
-; CHECK: @_gfortran_transfer_integer_write
-  %D.75807_352 = load i8*, i8** getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 0), align 8
-  %j1.8760_353 = load i32, i32* @j1.4581, align 4
-  %D.75773_354 = sext i32 %j1.8760_353 to i64
-  %iave.8736_355 = load i32, i32* @__main1_MOD_iave, align 4
-  %D.75620_356 = sext i32 %iave.8736_355 to i64
-  %D.75808_357 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 2, i32 0), align 8
-  %D.75809_358 = mul nsw i64 %D.75620_356, %D.75808_357
-  %igrp.8737_359 = load i32, i32* @__main1_MOD_igrp, align 4
-  %D.75635_360 = sext i32 %igrp.8737_359 to i64
-  %D.75810_361 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 1, i32 0), align 8
-  %D.75811_362 = mul nsw i64 %D.75635_360, %D.75810_361
-  %D.75812_363 = add nsw i64 %D.75809_358, %D.75811_362
-  %D.75846_364 = add nsw i64 %D.75773_354, %D.75812_363
-  %ityp.8750_365 = load i32, i32* @__main1_MOD_ityp, align 4
-  %D.75704_366 = sext i32 %ityp.8750_365 to i64
-  %D.75814_367 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 3, i64 3, i32 0), align 8
-  %D.75815_368 = mul nsw i64 %D.75704_366, %D.75814_367
-  %D.75847_369 = add nsw i64 %D.75846_364, %D.75815_368
-  %D.75817_370 = load i64, i64* getelementptr inbounds (%"struct.array4_real(kind=4)", %"struct.array4_real(kind=4)"* @__main1_MOD_rmxval, i64 0, i32 1), align 8
-  %D.75848_371 = add nsw i64 %D.75847_369, %D.75817_370
-  %tmp144 = bitcast i8* %D.75807_352 to [0 x float]*
-  %tmp145 = bitcast [0 x float]* %tmp144 to float*
-  %D.75849_372 = getelementptr inbounds float, float* %tmp145, i64 %D.75848_371
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* %D.75849_372, i32 4) nounwind
-; CHECK: @_gfortran_transfer_real_write
-  %D.75820_373 = load i8*, i8** getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 0), align 8
-  %j1.8760_374 = load i32, i32* @j1.4581, align 4
-  %D.75773_375 = sext i32 %j1.8760_374 to i64
-  %iave.8736_376 = load i32, i32* @__main1_MOD_iave, align 4
-  %D.75620_377 = sext i32 %iave.8736_376 to i64
-  %D.75821_378 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 2, i32 0), align 8
-  %D.75822_379 = mul nsw i64 %D.75620_377, %D.75821_378
-  %igrp.8737_380 = load i32, i32* @__main1_MOD_igrp, align 4
-  %D.75635_381 = sext i32 %igrp.8737_380 to i64
-  %D.75823_382 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 1, i32 0), align 8
-  %D.75824_383 = mul nsw i64 %D.75635_381, %D.75823_382
-  %D.75825_384 = add nsw i64 %D.75822_379, %D.75824_383
-  %D.75850_385 = add nsw i64 %D.75773_375, %D.75825_384
-  %ityp.8750_386 = load i32, i32* @__main1_MOD_ityp, align 4
-  %D.75704_387 = sext i32 %ityp.8750_386 to i64
-  %D.75827_388 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 3, i64 3, i32 0), align 8
-  %D.75828_389 = mul nsw i64 %D.75704_387, %D.75827_388
-  %D.75851_390 = add nsw i64 %D.75850_385, %D.75828_389
-  %D.75830_391 = load i64, i64* getelementptr inbounds (%struct.array4_unknown, %struct.array4_unknown* @__main1_MOD_mclmsg, i64 0, i32 1), align 8
-  %D.75852_392 = add nsw i64 %D.75851_390, %D.75830_391
-  %tmp146 = bitcast i8* %D.75820_373 to [0 x [1 x i8]]*
-  %tmp147 = bitcast [0 x [1 x i8]]* %tmp146 to [1 x i8]*
-  %D.75853_393 = getelementptr inbounds [1 x i8], [1 x i8]* %tmp147, i64 %D.75852_392
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [1 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [1 x i8]* %D.75853_393, i32 1) nounwind
-; CHECK: @_gfortran_transfer_character_write
-  %D.75833_394 = load i8*, i8** getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 0), align 8
-  %j1.8760_395 = load i32, i32* @j1.4581, align 4
-  %D.75773_396 = sext i32 %j1.8760_395 to i64
-  %iave.8736_397 = load i32, i32* @__main1_MOD_iave, align 4
-  %D.75620_398 = sext i32 %iave.8736_397 to i64
-  %D.75834_399 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 2, i32 0), align 8
-  %D.75835_400 = mul nsw i64 %D.75620_398, %D.75834_399
-  %igrp.8737_401 = load i32, i32* @__main1_MOD_igrp, align 4
-  %D.75635_402 = sext i32 %igrp.8737_401 to i64
-  %D.75836_403 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 1, i32 0), align 8
-  %D.75837_404 = mul nsw i64 %D.75635_402, %D.75836_403
-  %D.75838_405 = add nsw i64 %D.75835_400, %D.75837_404
-  %D.75854_406 = add nsw i64 %D.75773_396, %D.75838_405
-  %ityp.8750_407 = load i32, i32* @__main1_MOD_ityp, align 4
-  %D.75704_408 = sext i32 %ityp.8750_407 to i64
-  %D.75840_409 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 3, i64 3, i32 0), align 8
-  %D.75841_410 = mul nsw i64 %D.75704_408, %D.75840_409
-  %D.75855_411 = add nsw i64 %D.75854_406, %D.75841_410
-  %D.75843_412 = load i64, i64* getelementptr inbounds (%"struct.array4_integer(kind=4).73", %"struct.array4_integer(kind=4).73"* @__main1_MOD_mxdate, i64 0, i32 1), align 8
-  %D.75856_413 = add nsw i64 %D.75855_411, %D.75843_412
-  %tmp148 = bitcast i8* %D.75833_394 to [0 x i32]*
-  %tmp149 = bitcast [0 x i32]* %tmp148 to i32*
-  %D.75857_414 = getelementptr inbounds i32, i32* %tmp149, i64 %D.75856_413
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_integer_write to void (%struct.__st_parameter_dt*, i32*, i32)*)(%struct.__st_parameter_dt* %memtmp3, i32* %D.75857_414, i32 4) nounwind
-; CHECK: @_gfortran_transfer_integer_write
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @xr2.4593, i32 4) nounwind
-; CHECK: @_gfortran_transfer_real_write
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_real_write to void (%struct.__st_parameter_dt*, float*, i32)*)(%struct.__st_parameter_dt* %memtmp3, float* @yr2.4595, i32 4) nounwind
-; CHECK: @_gfortran_transfer_real_write
-  call void bitcast (void (%struct.__st_parameter_dt*, i8*, i32)* @_gfortran_transfer_character_write to void (%struct.__st_parameter_dt*, [2 x i8]*, i32)*)(%struct.__st_parameter_dt* %memtmp3, [2 x i8]* @nty2.4591, i32 2) nounwind
-; CHECK: @_gfortran_transfer_character_write
-  call void @_gfortran_st_write_done(%struct.__st_parameter_dt* %memtmp3) nounwind
-; CHECK: @_gfortran_st_write_done
-  %j.8758_415 = load i32, i32* @j.4580, align 4
-  %D.4634_416 = icmp eq i32 %j.8758_415, %D.4627_188.reload
-  %j.8758_417 = load i32, i32* @j.4580, align 4
-  %j.8770_418 = add nsw i32 %j.8758_417, 1
-  store i32 %j.8770_418, i32* @j.4580, align 4
-  %tmp150 = icmp ne i1 %D.4634_416, false
-  br i1 %tmp150, label %codeRepl80.exitStub, label %"<bb 34>.<bb 25>_crit_edge.exitStub"
-}
-
diff --git a/test/Transforms/BBVectorize/ld1.ll b/test/Transforms/BBVectorize/ld1.ll
deleted file mode 100644
index 368c38aa5ce..00000000000
--- a/test/Transforms/BBVectorize/ld1.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-
-define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %i2 = load double, double* %c, align 8
-  %add = fadd double %mul, %i2
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %arrayidx6 = getelementptr inbounds double, double* %c, i64 1
-  %i5 = load double, double* %arrayidx6, align 8
-  %add7 = fadd double %mul5, %i5
-  %mul9 = fmul double %add, %i1
-  %add11 = fadd double %mul9, %i2
-  %mul13 = fmul double %add7, %i4
-  %add15 = fadd double %mul13, %i5
-  %mul16 = fmul double %add11, %add15
-  ret double %mul16
-; CHECK-LABEL: @test1(
-; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
-; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
-; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>*
-; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8
-; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8
-; CHECK: %mul = fmul <2 x double> %i0, %i1
-; CHECK: %i2 = load <2 x double>, <2 x double>* %i2.v.i0, align 8
-; CHECK: %add = fadd <2 x double> %mul, %i2
-; CHECK: %mul9 = fmul <2 x double> %add, %i1
-; CHECK: %add11 = fadd <2 x double> %mul9, %i2
-; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0
-; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1
-; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2
-; CHECK: ret double %mul16
-}
-
diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg
deleted file mode 100644
index e71f3cc4c41..00000000000
--- a/test/Transforms/BBVectorize/lit.local.cfg
+++ /dev/null
@@ -1,3 +0,0 @@
-if not 'X86' in config.root.targets:
-    config.unsupported = True
-
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
deleted file mode 100644
index 8ff5953cf46..00000000000
--- a/test/Transforms/BBVectorize/loop1.ll
+++ /dev/null
@@ -1,93 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -dont-improve-non-negative-phi-bits=false -basicaa -loop-unroll -unroll-threshold=45 -unroll-partial-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
-; The second check covers the use of alias analysis (with loop unrolling).
-
-define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
-entry:
-  br label %for.body
-; CHECK-LABEL: @test1(
-; CHECK-UNRL-LABEL: @test1(
-
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv
-  %0 = load double, double* %arrayidx, align 8
-  %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv
-  %1 = load double, double* %arrayidx2, align 8
-  %mul = fmul double %0, %0
-  %mul3 = fmul double %0, %1
-  %add = fadd double %mul, %mul3
-  %add4 = fadd double %1, %1
-  %add5 = fadd double %add4, %0
-  %mul6 = fmul double %0, %add5
-  %add7 = fadd double %add, %mul6
-  %mul8 = fmul double %1, %1
-  %add9 = fadd double %0, %0
-  %add10 = fadd double %add9, %0
-  %mul11 = fmul double %mul8, %add10
-  %add12 = fadd double %add7, %mul11
-  %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-  store double %add12, double* %arrayidx14, align 8
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, 10
-  br i1 %exitcond, label %for.end, label %for.body
-; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-; CHECK: %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv
-; CHECK: %0 = load double, double* %arrayidx, align 8
-; CHECK: %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv
-; CHECK: %1 = load double, double* %arrayidx2, align 8
-; CHECK: %mul = fmul double %0, %0
-; CHECK: %mul3 = fmul double %0, %1
-; CHECK: %add = fadd double %mul, %mul3
-; CHECK: %mul8 = fmul double %1, %1
-; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
-; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
-; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
-; CHECK: %2 = insertelement <2 x double> undef, double %0, i32 0
-; CHECK: %add5.v.i1.2 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2
-; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %2, double %mul8, i32 1
-; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5
-; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0
-; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1
-; CHECK: %add7 = fadd double %add, %mul6.v.r1
-; CHECK: %add12 = fadd double %add7, %mul6.v.r2
-; CHECK: %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-; CHECK: store double %add12, double* %arrayidx14, align 8
-; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1
-; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10
-; CHECK: br i1 %exitcond, label %for.end, label %for.body
-; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
-; CHECK-UNRL: %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv
-; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*
-; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv
-; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*
-; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv
-; CHECK-UNRL: %2 = load <2 x double>, <2 x double>* %0, align 8
-; CHECK-UNRL: %3 = load <2 x double>, <2 x double>* %1, align 8
-; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
-; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
-; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
-; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
-; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
-; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
-; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
-; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
-; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
-; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
-; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
-; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
-; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
-; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
-; CHECK-UNRL: %indvars.iv.next.1 = add nuw nsw i64 %indvars.iv, 2
-; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
-; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
-; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  ret void
-}
diff --git a/test/Transforms/BBVectorize/mem-op-depth.ll b/test/Transforms/BBVectorize/mem-op-depth.ll
deleted file mode 100644
index 732043b7f8e..00000000000
--- a/test/Transforms/BBVectorize/mem-op-depth.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-unknown-linux-gnu"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=6 -instcombine -gvn -S | FileCheck %s
-
-@A = common global [1024 x float] zeroinitializer, align 16
-@B = common global [1024 x float] zeroinitializer, align 16
-
-define i32 @test1() nounwind {
-; CHECK-LABEL: @test1(
-  %V1 = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 0), align 16
-  %V2 = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 1), align 4
-  %V3= load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 2), align 8
-  %V4 = load float, float* getelementptr inbounds ([1024 x float], [1024 x float]* @A, i64 0, i64 3), align 4
-; CHECK:   %V1 = load <4 x float>, <4 x float>* bitcast ([1024 x float]* @A to <4 x float>*), align 16
-  store float %V1, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 0), align 16
-  store float %V2, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 1), align 4
-  store float %V3, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 2), align 8
-  store float %V4, float* getelementptr inbounds ([1024 x float], [1024 x float]* @B, i64 0, i64 3), align 4
-; CHECK-NEXT: store <4 x float> %V1, <4 x float>* bitcast ([1024 x float]* @B to <4 x float>*), align 16
-  ret i32 0
-; CHECK-NEXT: ret i32 0
-}
diff --git a/test/Transforms/BBVectorize/metadata.ll b/test/Transforms/BBVectorize/metadata.ll
deleted file mode 100644
index f5580a88861..00000000000
--- a/test/Transforms/BBVectorize/metadata.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -S | FileCheck %s
-
-; Simple 3-pair chain with loads and stores (with fpmath)
-define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1, !fpmath !2
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4, !fpmath !3
-  store double %mul, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %mul5, double* %arrayidx5, align 8
-  ret void
-; CHECK-LABEL: @test1(
-; CHECK: !fpmath
-; CHECK: ret void
-}
-
-; Simple 3-pair chain with loads and stores (ints with range)
-define void @test2(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load i64, i64* %a, align 8, !range !0
-  %i1 = load i64, i64* %b, align 8
-  %mul = mul i64 %i0, %i1
-  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1
-  %i3 = load i64, i64* %arrayidx3, align 8, !range !1
-  %arrayidx4 = getelementptr inbounds i64, i64* %b, i64 1
-  %i4 = load i64, i64* %arrayidx4, align 8
-  %mul5 = mul i64 %i3, %i4
-  store i64 %mul, i64* %c, align 8
-  %arrayidx5 = getelementptr inbounds i64, i64* %c, i64 1
-  store i64 %mul5, i64* %arrayidx5, align 8
-  ret void
-; CHECK-LABEL: @test2(
-; CHECK-NOT: !range
-; CHECK: ret void
-}
-
-!0 = !{i64 0, i64 2}
-!1 = !{i64 3, i64 5}
-
-!2 = !{ float 5.0 }
-!3 = !{ float 2.5 }
-
diff --git a/test/Transforms/BBVectorize/no-ldstr-conn.ll b/test/Transforms/BBVectorize/no-ldstr-conn.ll
deleted file mode 100644
index a84cd658560..00000000000
--- a/test/Transforms/BBVectorize/no-ldstr-conn.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=2 -instcombine -gvn -S | FileCheck %s
-
-; Make sure that things (specifically getelementptr) are not connected to loads
-; and stores via the address operand (which would be bad because the address
-; is really a scalar even after vectorization)
-define i64 @test2(i64 %a) nounwind uwtable readonly {
-entry:
-  %a1 = inttoptr i64 %a to i64*
-  %a2 = getelementptr i64, i64* %a1, i64 1
-  %a3 = getelementptr i64, i64* %a1, i64 2
-  %v2 = load i64, i64* %a2, align 8
-  %v3 = load i64, i64* %a3, align 8
-  %v2a = add i64 %v2, 5
-  %v3a = add i64 %v3, 7
-  store i64 %v2a, i64* %a2, align 8
-  store i64 %v3a, i64* %a3, align 8
-  %r = add i64 %v2, %v3
-  ret i64 %r
-; CHECK-LABEL: @test2(
-; CHECK-NOT: getelementptr i64, <2 x i64*>
-}
-
diff --git a/test/Transforms/BBVectorize/req-depth.ll b/test/Transforms/BBVectorize/req-depth.ll
deleted file mode 100644
index 2675354183a..00000000000
--- a/test/Transforms/BBVectorize/req-depth.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -bb-vectorize-ignore-target-info -S | FileCheck %s -check-prefix=CHECK-RD3
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -bb-vectorize-ignore-target-info -S | FileCheck %s -check-prefix=CHECK-RD2
-
-define double @test1(double %A1, double %A2, double %B1, double %B2) {
-	%X1 = fsub double %A1, %B1
-	%X2 = fsub double %A2, %B2
-	%Y1 = fmul double %X1, %A1
-	%Y2 = fmul double %X2, %A2
-	%R  = fmul double %Y1, %Y2
-	ret double %R
-; CHECK-RD3-LABEL: @test1(
-; CHECK-RD2-LABEL: @test1(
-; CHECK-RD3-NOT: <2 x double>
-; CHECK-RD2: <2 x double>
-}
-
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll
deleted file mode 100644
index be38d340260..00000000000
--- a/test/Transforms/BBVectorize/search-limit.ll
+++ /dev/null
@@ -1,46 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
-
-define double @test1(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test1(
-; CHECK-SL4-LABEL: @test1(
-; CHECK-SL4-NOT: <2 x double>
-; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
-; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
-	%X1 = fsub double %A1, %B1
-	%X2 = fsub double %A2, %B2
-; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
-	%Y1 = fmul double %X1, %A1
-	%Y2 = fmul double %X2, %A2
-; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
-	%Z1 = fadd double %Y1, %B1
-        ; Here we have a dependency chain: the short search limit will not
-        ; see past this chain and so will not see the second part of the
-        ; pair to vectorize.
-        %mul41 = fmul double %Z1, %Y2
-        %sub48 = fsub double %Z1, %mul41
-        %mul62 = fmul double %Z1, %sub48
-        %sub69 = fsub double %Z1, %mul62
-        %mul83 = fmul double %Z1, %sub69
-        %sub90 = fsub double %Z1, %mul83
-        %mul104 = fmul double %Z1, %sub90
-        %sub111 = fsub double %Z1, %mul104
-        %mul125 = fmul double %Z1, %sub111
-        %sub132 = fsub double %Z1, %mul125
-        %mul146 = fmul double %Z1, %sub132
-        %sub153 = fsub double %Z1, %mul146
-        ; end of chain.
-	%Z2 = fadd double %Y2, %B2
-; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
-	%R1  = fdiv double %Z1, %Z2
-        %R   = fmul double %R1, %sub153
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2
-	ret double %R
-; CHECK: ret double %R
-}
-
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
deleted file mode 100644
index dd5e90841a7..00000000000
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ /dev/null
@@ -1,514 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-
-declare double @llvm.fma.f64(double, double, double)
-declare double @llvm.fmuladd.f64(double, double, double)
-declare double @llvm.cos.f64(double)
-declare double @llvm.powi.f64(double, i32)
-declare double @llvm.round.f64(double)
-declare double @llvm.copysign.f64(double, double)
-declare double @llvm.ceil.f64(double)
-declare double @llvm.nearbyint.f64(double)
-declare double @llvm.rint.f64(double)
-declare double @llvm.trunc.f64(double)
-declare double @llvm.floor.f64(double)
-declare double @llvm.fabs.f64(double)
-declare i64 @llvm.bswap.i64(i64)
-declare i64 @llvm.ctpop.i64(i64)
-declare i64 @llvm.ctlz.i64(i64, i1)
-declare i64 @llvm.cttz.i64(i64, i1)
-
-; Basic depth-3 chain with fma
-define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1_V_I2_1:%.*]] = insertelement <2 x double> undef, double [[C1:%.*]], i32 0
-; CHECK-NEXT:    [[Y1_V_I2_2:%.*]] = insertelement <2 x double> [[Y1_V_I2_1]], double [[C2:%.*]], i32 1
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.fma.v2f64(<2 x double> [[X1]], <2 x double> [[X1_V_I0_2]], <2 x double> [[Y1_V_I2_2]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
-  %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with fmuladd
-define double @test1a(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
-; CHECK-LABEL: @test1a(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1_V_I2_1:%.*]] = insertelement <2 x double> undef, double [[C1:%.*]], i32 0
-; CHECK-NEXT:    [[Y1_V_I2_2:%.*]] = insertelement <2 x double> [[Y1_V_I2_1]], double [[C2:%.*]], i32 1
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[X1]], <2 x double> [[X1_V_I0_2]], <2 x double> [[Y1_V_I2_2]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.fmuladd.f64(double %X1, double %A1, double %C1)
-  %Y2 = call double @llvm.fmuladd.f64(double %X2, double %A2, double %C2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with cos
-define double @test2(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.cos.f64(double %X1)
-  %Y2 = call double @llvm.cos.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with powi
-define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.powi.v2f64(<2 x double> [[X1]], i32 [[P:%.*]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
-  %Y2 = call double @llvm.powi.f64(double %X2, i32 %P)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with powi (different powers: should not vectorize)
-define double @test4(double %A1, double %A2, double %B1, double %B2, i32 %P) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[X1:%.*]] = fsub double [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = fsub double [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[P2:%.*]] = add i32 [[P:%.*]], 1
-; CHECK-NEXT:    [[Y1:%.*]] = call double @llvm.powi.f64(double [[X1]], i32 [[P]])
-; CHECK-NEXT:    [[Y2:%.*]] = call double @llvm.powi.f64(double [[X2]], i32 [[P2]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd double [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = fadd double [[Y2]], [[B2]]
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1]], [[Z2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %P2 = add i32 %P, 1
-  %Y1 = call double @llvm.powi.f64(double %X1, i32 %P)
-  %Y2 = call double @llvm.powi.f64(double %X2, i32 %P2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with round
-define double @testround(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testround(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.round.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.round.f64(double %X1)
-  %Y2 = call double @llvm.round.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with copysign
-define double @testcopysign(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testcopysign(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1_V_I1_2:%.*]] = shufflevector <2 x double> [[X1_V_I0_1]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.copysign.v2f64(<2 x double> [[X1]], <2 x double> [[Y1_V_I1_2]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.copysign.f64(double %X1, double %A1)
-  %Y2 = call double @llvm.copysign.f64(double %X2, double %A1)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with ceil
-define double @testceil(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testceil(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.ceil.f64(double %X1)
-  %Y2 = call double @llvm.ceil.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with nearbyint
-define double @testnearbyint(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testnearbyint(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.nearbyint.f64(double %X1)
-  %Y2 = call double @llvm.nearbyint.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with rint
-define double @testrint(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testrint(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.rint.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.rint.f64(double %X1)
-  %Y2 = call double @llvm.rint.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with trunc
-define double @testtrunc(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testtrunc(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.trunc.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.trunc.f64(double %X1)
-  %Y2 = call double @llvm.trunc.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with floor
-define double @testfloor(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testfloor(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.floor.f64(double %X1)
-  %Y2 = call double @llvm.floor.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with fabs
-define double @testfabs(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @testfabs(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x double> @llvm.fabs.v2f64(<2 x double> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = call double @llvm.fabs.f64(double %X1)
-  %Y2 = call double @llvm.fabs.f64(double %X2)
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain with bswap
-define i64 @testbswap(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
-; CHECK-LABEL: @testbswap(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %X1 = sub i64 %A1, %B1
-  %X2 = sub i64 %A2, %B2
-  %Y1 = call i64 @llvm.bswap.i64(i64 %X1)
-  %Y2 = call i64 @llvm.bswap.i64(i64 %X2)
-  %Z1 = add i64 %Y1, %B1
-  %Z2 = add i64 %Y2, %B2
-  %R  = mul i64 %Z1, %Z2
-  ret i64 %R
-}
-
-; Basic depth-3 chain with ctpop
-define i64 @testctpop(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
-; CHECK-LABEL: @testctpop(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[X1]])
-; CHECK-NEXT:    [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %X1 = sub i64 %A1, %B1
-  %X2 = sub i64 %A2, %B2
-  %Y1 = call i64 @llvm.ctpop.i64(i64 %X1)
-  %Y2 = call i64 @llvm.ctpop.i64(i64 %X2)
-  %Z1 = add i64 %Y1, %B1
-  %Z2 = add i64 %Y2, %B2
-  %R  = mul i64 %Z1, %Z2
-  ret i64 %R
-}
-
-; Basic depth-3 chain with ctlz
-define i64 @testctlz(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
-; CHECK-LABEL: @testctlz(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[X1]], i1 true)
-; CHECK-NEXT:    [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %X1 = sub i64 %A1, %B1
-  %X2 = sub i64 %A2, %B2
-  %Y1 = call i64 @llvm.ctlz.i64(i64 %X1, i1 true)
-  %Y2 = call i64 @llvm.ctlz.i64(i64 %X2, i1 true)
-  %Z1 = add i64 %Y1, %B1
-  %Z2 = add i64 %Y2, %B2
-  %R  = mul i64 %Z1, %Z2
-  ret i64 %R
-
-}
-
-; Basic depth-3 chain with ctlz
-define i64 @testctlzneg(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
-; CHECK-LABEL: @testctlzneg(
-; CHECK-NEXT:    [[X1:%.*]] = sub i64 [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = sub i64 [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[Y1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[X1]], i1 true), !range !0
-; CHECK-NEXT:    [[Y2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[X2]], i1 false), !range !0
-; CHECK-NEXT:    [[Z1:%.*]] = add i64 [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = add i64 [[Y2]], [[B2]]
-; CHECK-NEXT:    [[R:%.*]] = mul i64 [[Z1]], [[Z2]]
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %X1 = sub i64 %A1, %B1
-  %X2 = sub i64 %A2, %B2
-  %Y1 = call i64 @llvm.ctlz.i64(i64 %X1, i1 true)
-  %Y2 = call i64 @llvm.ctlz.i64(i64 %X2, i1 false)
-  %Z1 = add i64 %Y1, %B1
-  %Z2 = add i64 %Y2, %B2
-  %R  = mul i64 %Z1, %Z2
-  ret i64 %R
-}
-
-; Basic depth-3 chain with cttz
-define i64 @testcttz(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
-; CHECK-LABEL: @testcttz(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> [[X1]], i1 true)
-; CHECK-NEXT:    [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %X1 = sub i64 %A1, %B1
-  %X2 = sub i64 %A2, %B2
-  %Y1 = call i64 @llvm.cttz.i64(i64 %X1, i1 true)
-  %Y2 = call i64 @llvm.cttz.i64(i64 %X2, i1 true)
-  %Z1 = add i64 %Y1, %B1
-  %Z2 = add i64 %Y2, %B2
-  %R  = mul i64 %Z1, %Z2
-  ret i64 %R
-
-}
-
-; Basic depth-3 chain with cttz
-define i64 @testcttzneg(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
-; CHECK-LABEL: @testcttzneg(
-; CHECK-NEXT:    [[X1:%.*]] = sub i64 [[A1:%.*]], [[B1:%.*]]
-; CHECK-NEXT:    [[X2:%.*]] = sub i64 [[A2:%.*]], [[B2:%.*]]
-; CHECK-NEXT:    [[Y1:%.*]] = call i64 @llvm.cttz.i64(i64 [[X1]], i1 true), !range !0
-; CHECK-NEXT:    [[Y2:%.*]] = call i64 @llvm.cttz.i64(i64 [[X2]], i1 false), !range !0
-; CHECK-NEXT:    [[Z1:%.*]] = add i64 [[Y1]], [[B1]]
-; CHECK-NEXT:    [[Z2:%.*]] = add i64 [[Y2]], [[B2]]
-; CHECK-NEXT:    [[R:%.*]] = mul i64 [[Z1]], [[Z2]]
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %X1 = sub i64 %A1, %B1
-  %X2 = sub i64 %A2, %B2
-  %Y1 = call i64 @llvm.cttz.i64(i64 %X1, i1 true)
-  %Y2 = call i64 @llvm.cttz.i64(i64 %X2, i1 false)
-  %Z1 = add i64 %Y1, %B1
-  %Z2 = add i64 %Y2, %B2
-  %R  = mul i64 %Z1, %Z2
-  ret i64 %R
-}
-
-; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #0
-; CHECK: declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) #0
-; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) #0
-; CHECK: declare <2 x double> @llvm.powi.v2f64(<2 x double>, i32) #0
-; CHECK: declare <2 x double> @llvm.round.v2f64(<2 x double>) #0
-; CHECK: declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) #0
-; CHECK: declare <2 x double> @llvm.ceil.v2f64(<2 x double>) #0
-; CHECK: declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0
-; CHECK: declare <2 x double> @llvm.rint.v2f64(<2 x double>) #0
-; CHECK: declare <2 x double> @llvm.trunc.v2f64(<2 x double>) #0
-; CHECK: declare <2 x double> @llvm.floor.v2f64(<2 x double>) #0
-; CHECK: declare <2 x double> @llvm.fabs.v2f64(<2 x double>) #0
-; CHECK: declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) #0
-; CHECK: declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) #0
-; CHECK: declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0
-; CHECK: declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) #0
-; CHECK: attributes #0 = { nounwind readnone speculatable }
diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
deleted file mode 100644
index fcc0236bae9..00000000000
--- a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
+++ /dev/null
@@ -1,134 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
-
-; FIXME: re-enable this once pointer vectors work properly
-; XFAIL: *
-
-; Simple 3-pair chain also with loads and stores (using ptrs and gep)
-define double @test1(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load i64, i64* %a, align 8
-  %i1 = load i64, i64* %b, align 8
-  %mul = mul i64 %i0, %i1
-  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 1
-  %i3 = load i64, i64* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds i64, i64* %b, i64 1
-  %i4 = load i64, i64* %arrayidx4, align 8
-  %mul5 = mul i64 %i3, %i4
-  %ptr = inttoptr i64 %mul to double*
-  %ptr5 = inttoptr i64 %mul5 to double*
-  %aptr = getelementptr inbounds double, double* %ptr, i64 2
-  %aptr5 = getelementptr inbounds double, double* %ptr5, i64 3
-  %av = load double, double* %aptr, align 16
-  %av5 = load double, double* %aptr5, align 16
-  %r = fmul double %av, %av5
-  store i64 %mul, i64* %c, align 8
-  %arrayidx5 = getelementptr inbounds i64, i64* %c, i64 1
-  store i64 %mul5, i64* %arrayidx5, align 8
-  ret double %r
-; CHECK-LABEL: @test1(
-; CHECK: %i0.v.i0 = bitcast i64* %a to <2 x i64>*
-; CHECK: %i1.v.i0 = bitcast i64* %b to <2 x i64>*
-; CHECK: %i0 = load <2 x i64>, <2 x i64>* %i0.v.i0, align 8
-; CHECK: %i1 = load <2 x i64>, <2 x i64>* %i1.v.i0, align 8
-; CHECK: %mul = mul <2 x i64> %i0, %i1
-; CHECK: %ptr = inttoptr <2 x i64> %mul to <2 x double*>
-; CHECK: %aptr = getelementptr inbounds double, <2 x double*> %ptr, <2 x i64> <i64 2, i64 3>
-; CHECK: %aptr.v.r1 = extractelement <2 x double*> %aptr, i32 0
-; CHECK: %aptr.v.r2 = extractelement <2 x double*> %aptr, i32 1
-; CHECK: %av = load double, double* %aptr.v.r1, align 16
-; CHECK: %av5 = load double, double* %aptr.v.r2, align 16
-; CHECK: %r = fmul double %av, %av5
-; CHECK: %0 = bitcast i64* %c to <2 x i64>*
-; CHECK: store <2 x i64> %mul, <2 x i64>* %0, align 8
-; CHECK: ret double %r
-; CHECK-AO-LABEL: @test1(
-; CHECK-AO-NOT: load <2 x
-}
-
-; Simple 3-pair chain with loads and stores (using ptrs and gep)
-define void @test2(i64** %a, i64** %b, i64** %c) nounwind uwtable readonly {
-entry:
-  %i0 = load i64*, i64** %a, align 8
-  %i1 = load i64*, i64** %b, align 8
-  %arrayidx3 = getelementptr inbounds i64*, i64** %a, i64 1
-  %i3 = load i64*, i64** %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds i64*, i64** %b, i64 1
-  %i4 = load i64*, i64** %arrayidx4, align 8
-  %o1 = load i64, i64* %i1, align 8
-  %o4 = load i64, i64* %i4, align 8
-  %ptr0 = getelementptr inbounds i64, i64* %i0, i64 %o1
-  %ptr3 = getelementptr inbounds i64, i64* %i3, i64 %o4
-  store i64* %ptr0, i64** %c, align 8
-  %arrayidx5 = getelementptr inbounds i64*, i64** %c, i64 1
-  store i64* %ptr3, i64** %arrayidx5, align 8
-  ret void
-; CHECK-LABEL: @test2(
-; CHECK: %i0.v.i0 = bitcast i64** %a to <2 x i64*>*
-; CHECK: %i1 = load i64*, i64** %b, align 8
-; CHECK: %i0 = load <2 x i64*>, <2 x i64*>* %i0.v.i0, align 8
-; CHECK: %arrayidx4 = getelementptr inbounds i64*, i64** %b, i64 1
-; CHECK: %i4 = load i64*, i64** %arrayidx4, align 8
-; CHECK: %o1 = load i64, i64* %i1, align 8
-; CHECK: %o4 = load i64, i64* %i4, align 8
-; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0
-; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1
-; CHECK: %ptr0 = getelementptr inbounds i64, <2 x i64*> %i0, <2 x i64> %ptr0.v.i1.2
-; CHECK: %0 = bitcast i64** %c to <2 x i64*>*
-; CHECK: store <2 x i64*> %ptr0, <2 x i64*>* %0, align 8
-; CHECK: ret void
-; CHECK-AO-LABEL: @test2(
-; CHECK-AO-NOT: <2 x
-}
-
-; Simple 3-pair chain with loads and stores (using ptrs and gep)
-; using pointer vectors.
-define void @test3(<2 x i64*>* %a, <2 x i64*>* %b, <2 x i64*>* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load <2 x i64*>, <2 x i64*>* %a, align 8
-  %i1 = load <2 x i64*>, <2 x i64*>* %b, align 8
-  %arrayidx3 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %a, i64 1
-  %i3 = load <2 x i64*>, <2 x i64*>* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %b, i64 1
-  %i4 = load <2 x i64*>, <2 x i64*>* %arrayidx4, align 8
-  %j1 = extractelement <2 x i64*> %i1, i32 0
-  %j4 = extractelement <2 x i64*> %i4, i32 0
-  %o1 = load i64, i64* %j1, align 8
-  %o4 = load i64, i64* %j4, align 8
-  %j0 = extractelement <2 x i64*> %i0, i32 0
-  %j3 = extractelement <2 x i64*> %i3, i32 0
-  %ptr0 = getelementptr inbounds i64, i64* %j0, i64 %o1
-  %ptr3 = getelementptr inbounds i64, i64* %j3, i64 %o4
-  %qtr0 = insertelement <2 x i64*> undef, i64* %ptr0, i32 0
-  %rtr0 = insertelement <2 x i64*> %qtr0, i64* %ptr0, i32 1
-  %qtr3 = insertelement <2 x i64*> undef, i64* %ptr3, i32 0
-  %rtr3 = insertelement <2 x i64*> %qtr3, i64* %ptr3, i32 1
-  store <2 x i64*> %rtr0, <2 x i64*>* %c, align 8
-  %arrayidx5 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %c, i64 1
-  store <2 x i64*> %rtr3, <2 x i64*>* %arrayidx5, align 8
-  ret void
-; CHECK-LABEL: @test3(
-; CHECK: %i0.v.i0 = bitcast <2 x i64*>* %a to <4 x i64*>*
-; CHECK: %i1 = load <2 x i64*>, <2 x i64*>* %b, align 8
-; CHECK: %i0 = load <4 x i64*>, <4 x i64*>* %i0.v.i0, align 8
-; CHECK: %arrayidx4 = getelementptr inbounds <2 x i64*>, <2 x i64*>* %b, i64 1
-; CHECK: %i4 = load <2 x i64*>, <2 x i64*>* %arrayidx4, align 8
-; CHECK: %j1 = extractelement <2 x i64*> %i1, i32 0
-; CHECK: %j4 = extractelement <2 x i64*> %i4, i32 0
-; CHECK: %o1 = load i64, i64* %j1, align 8
-; CHECK: %o4 = load i64, i64* %j4, align 8
-; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0
-; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1
-; CHECK: %ptr0.v.i0 = shufflevector <4 x i64*> %i0, <4 x i64*> undef, <2 x i32> <i32 0, i32 2>
-; CHECK: %ptr0 = getelementptr inbounds i64, <2 x i64*> %ptr0.v.i0, <2 x i64> %ptr0.v.i1.2
-; CHECK: %rtr0 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> zeroinitializer
-; CHECK: %rtr3 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> <i32 1, i32 1>
-; CHECK: %0 = bitcast <2 x i64*>* %c to <4 x i64*>*
-; CHECK: %1 = shufflevector <2 x i64*> %rtr0, <2 x i64*> %rtr3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: store <4 x i64*> %1, <4 x i64*>* %0, align 8
-; CHECK: ret void
-; CHECK-AO-LABEL: @test3(
-; CHECK-AO-NOT: <4 x
-}
-
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll
deleted file mode 100644
index 56c1a06b42e..00000000000
--- a/test/Transforms/BBVectorize/simple-ldstr.ll
+++ /dev/null
@@ -1,170 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
-
-; Simple 3-pair chain with loads and stores
-define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  store double %mul, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %mul5, double* %arrayidx5, align 8
-  ret void
-; CHECK-LABEL: @test1(
-; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
-; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
-; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8
-; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8
-; CHECK: %mul = fmul <2 x double> %i0, %i1
-; CHECK: %0 = bitcast double* %c to <2 x double>*
-; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
-; CHECK: ret void
-; CHECK-AO-LABEL: @test1(
-; CHECK-AO-NOT: <2 x double>
-}
-
-; Simple chain with extending loads and stores
-define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly {
-entry:
-  %i0f = load float, float* %a, align 4
-  %i0 = fpext float %i0f to double
-  %i1f = load float, float* %b, align 4
-  %i1 = fpext float %i1f to double
-  %mul = fmul double %i0, %i1
-  %arrayidx3 = getelementptr inbounds float, float* %a, i64 1
-  %i3f = load float, float* %arrayidx3, align 4
-  %i3 = fpext float %i3f to double
-  %arrayidx4 = getelementptr inbounds float, float* %b, i64 1
-  %i4f = load float, float* %arrayidx4, align 4
-  %i4 = fpext float %i4f to double
-  %mul5 = fmul double %i3, %i4
-  store double %mul, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %mul5, double* %arrayidx5, align 8
-  ret void
-; CHECK-LABEL: @test2(
-; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>*
-; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>*
-; CHECK: %i0f = load <2 x float>, <2 x float>* %i0f.v.i0, align 4
-; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double>
-; CHECK: %i1f = load <2 x float>, <2 x float>* %i1f.v.i0, align 4
-; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double>
-; CHECK: %mul = fmul <2 x double> %i0, %i1
-; CHECK: %0 = bitcast double* %c to <2 x double>*
-; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
-; CHECK: ret void
-; CHECK-AO-LABEL: @test2(
-; CHECK-AO-NOT: <2 x double>
-}
-
-; Simple chain with loads and truncating stores
-define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %mulf = fptrunc double %mul to float
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %mul5f = fptrunc double %mul5 to float
-  store float %mulf, float* %c, align 8
-  %arrayidx5 = getelementptr inbounds float, float* %c, i64 1
-  store float %mul5f, float* %arrayidx5, align 4
-  ret void
-; CHECK-LABEL: @test3(
-; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
-; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
-; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8
-; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8
-; CHECK: %mul = fmul <2 x double> %i0, %i1
-; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float>
-; CHECK: %0 = bitcast float* %c to <2 x float>*
-; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8
-; CHECK: ret void
-; CHECK-AO-LABEL: @test3(
-; CHECK-AO: %i0 = load double, double* %a, align 8
-; CHECK-AO: %i1 = load double, double* %b, align 8
-; CHECK-AO: %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-; CHECK-AO: %i3 = load double, double* %arrayidx3, align 8
-; CHECK-AO: %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-; CHECK-AO: %i4 = load double, double* %arrayidx4, align 8
-; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
-; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
-; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
-; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
-; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
-; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
-; CHECK-AO: %0 = bitcast float* %c to <2 x float>*
-; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
-; CHECK-AO: ret void
-}
-
-; Simple 3-pair chain with loads and stores (unreachable)
-define void @test4(i1 %bool, double* %a, double* %b, double* %c) nounwind uwtable readonly {
-entry:
-  br i1 %bool, label %if.then1, label %if.end
-
-if.then1:
-  unreachable
-  br label %if.then
-
-if.then:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  store double %mul, double* %c, align 8
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %mul5, double* %arrayidx5, align 8
-  br label %if.end
-
-if.end:
-  ret void
-; CHECK-LABEL: @test4(
-; CHECK-NOT: <2 x double>
-; CHECK-AO-LABEL: @test4(
-; CHECK-AO-NOT: <2 x double>
-}
-
-; Simple 3-pair chain with loads and stores
-define void @test5(double* %a, double* %b, double* %c) nounwind uwtable readonly {
-entry:
-  %i0 = load double, double* %a, align 8
-  %i1 = load double, double* %b, align 8
-  %mul = fmul double %i0, %i1
-  %arrayidx3 = getelementptr inbounds double, double* %a, i64 1
-  %i3 = load double, double* %arrayidx3, align 8
-  %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
-  %i4 = load double, double* %arrayidx4, align 8
-  %mul5 = fmul double %i3, %i4
-  %arrayidx5 = getelementptr inbounds double, double* %c, i64 1
-  store double %mul5, double* %arrayidx5, align 8
-  store double %mul, double* %c, align 4
-  ret void
-; CHECK-LABEL: @test5(
-; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
-; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
-; CHECK: %i0 = load <2 x double>, <2 x double>* %i0.v.i0, align 8
-; CHECK: %i1 = load <2 x double>, <2 x double>* %i1.v.i0, align 8
-; CHECK: %mul = fmul <2 x double> %i0, %i1
-; CHECK: %0 = bitcast double* %c to <2 x double>*
-; CHECK: store <2 x double> %mul, <2 x double>* %0, align 4
-; CHECK: ret void
-; CHECK-AO-LABEL: @test5(
-; CHECK-AO-NOT: <2 x double>
-}
-
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll
deleted file mode 100644
index 269b07f82d1..00000000000
--- a/test/Transforms/BBVectorize/simple-sel.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-no-bools -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-NB
-
-; Basic depth-3 chain with select
-define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
-; CHECK-LABEL: @test1(
-; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
-; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
-	%X1 = fsub double %A1, %B1
-	%X2 = fsub double %A2, %B2
-; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
-	%Y1 = fmul double %X1, %A1
-	%Y2 = fmul double %X2, %A2
-; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
-        %Z1 = select i1 %C1, double %Y1, double %B1
-        %Z2 = select i1 %C2, double %Y2, double %B2
-; CHECK: %Z1.v.i0.1 = insertelement <2 x i1> undef, i1 %C1, i32 0
-; CHECK: %Z1.v.i0.2 = insertelement <2 x i1> %Z1.v.i0.1, i1 %C2, i32 1
-; CHECK: %Z1 = select <2 x i1> %Z1.v.i0.2, <2 x double> %Y1, <2 x double> %X1.v.i1.2
-	%R  = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
-	ret double %R
-; CHECK: ret double %R
-}
-
-; Basic depth-3 chain with select (and vect. compare)
-define double @test2(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test2(
-; CHECK-NB-LABEL: @test2(
-; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
-; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
-	%X1 = fsub double %A1, %B1
-	%X2 = fsub double %A2, %B2
-; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
-	%Y1 = fmul double %X1, %A1
-	%Y2 = fmul double %X2, %A2
-; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
-	%C1 = fcmp ogt double %X1, %A1
-        %C2 = fcmp ogt double %X2, %A2
-; CHECK: %C1 = fcmp ogt <2 x double> %X1, %X1.v.i0.2
-; CHECK-NB: fcmp ogt double
-        %Z1 = select i1 %C1, double %Y1, double %B1
-        %Z2 = select i1 %C2, double %Y2, double %B2
-; CHECK: %Z1 = select <2 x i1> %C1, <2 x double> %Y1, <2 x double> %X1.v.i1.2
-	%R  = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
-	ret double %R
-; CHECK: ret double %R
-}
-
diff --git a/test/Transforms/BBVectorize/simple-tst.ll b/test/Transforms/BBVectorize/simple-tst.ll
deleted file mode 100644
index 6a88e1b09c1..00000000000
--- a/test/Transforms/BBVectorize/simple-tst.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux"
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=256 -instcombine -gvn -S | FileCheck %s
-
-; Basic depth-3 chain (target-specific type should not vectorize)
-define ppc_fp128 @test7(ppc_fp128 %A1, ppc_fp128 %A2, ppc_fp128 %B1, ppc_fp128 %B2) {
-; CHECK-LABEL: @test7(
-; CHECK-NOT: <2 x ppc_fp128>
-	%X1 = fsub ppc_fp128 %A1, %B1
-	%X2 = fsub ppc_fp128 %A2, %B2
-	%Y1 = fmul ppc_fp128 %X1, %A1
-	%Y2 = fmul ppc_fp128 %X2, %A2
-	%Z1 = fadd ppc_fp128 %Y1, %B1
-	%Z2 = fadd ppc_fp128 %Y2, %B2
-	%R  = fmul ppc_fp128 %Z1, %Z2
-	ret ppc_fp128 %R
-}
-
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
deleted file mode 100644
index 12f97ab77ba..00000000000
--- a/test/Transforms/BBVectorize/simple.ll
+++ /dev/null
@@ -1,209 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-
-; Basic depth-3 chain
-define double @test1(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain (last pair permuted)
-define double @test2(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B2]], i32 0
-; CHECK-NEXT:    [[Z1_V_I1_2:%.*]] = insertelement <2 x double> [[Z1_V_I1_1]], double [[B1]], i32 1
-; CHECK-NEXT:    [[Z2:%.*]] = fadd <2 x double> [[Y1]], [[Z1_V_I1_2]]
-; CHECK-NEXT:    [[Z2_V_R1:%.*]] = extractelement <2 x double> [[Z2]], i32 0
-; CHECK-NEXT:    [[Z2_V_R2:%.*]] = extractelement <2 x double> [[Z2]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z2_V_R2]], [[Z2_V_R1]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y2, %B1
-  %Z2 = fadd double %Y1, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain (last pair first splat)
-define double @test3(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1_V_I0:%.*]] = shufflevector <2 x double> [[Y1]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Z1_V_I0]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y2, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain (last pair second splat)
-define double @test4(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1_V_I0:%.*]] = shufflevector <2 x double> [[Y1]], <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Z1_V_I0]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y1, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain
-define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) {
-; CHECK-LABEL: @test5(
-; CHECK-NEXT:    [[X1_V_I1:%.*]] = shufflevector <2 x float> [[B1:%.*]], <2 x float> [[B2:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[X1_V_I0:%.*]] = shufflevector <2 x float> [[A1:%.*]], <2 x float> [[A2:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT:    [[X1:%.*]] = fsub <4 x float> [[X1_V_I0]], [[X1_V_I1]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <4 x float> [[X1]], [[X1_V_I0]]
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <4 x float> [[Y1]], [[X1_V_I1]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = shufflevector <4 x float> [[Z1]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = shufflevector <4 x float> [[Z1]], <4 x float> undef, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[R:%.*]] = fmul <2 x float> [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret <2 x float> [[R]]
-;
-  %X1 = fsub <2 x float> %A1, %B1
-  %X2 = fsub <2 x float> %A2, %B2
-  %Y1 = fmul <2 x float> %X1, %A1
-  %Y2 = fmul <2 x float> %X2, %A2
-  %Z1 = fadd <2 x float> %Y1, %B1
-  %Z2 = fadd <2 x float> %Y2, %B2
-  %R  = fmul <2 x float> %Z1, %Z2
-  ret <2 x float> %R
-}
-
-; Basic chain with shuffles
-define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT:    [[X1_V_I1:%.*]] = shufflevector <8 x i8> [[B1:%.*]], <8 x i8> [[B2:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[X1_V_I0:%.*]] = shufflevector <8 x i8> [[A1:%.*]], <8 x i8> [[A2:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[X1:%.*]] = sub <16 x i8> [[X1_V_I0]], [[X1_V_I1]]
-; CHECK-NEXT:    [[Y1:%.*]] = mul <16 x i8> [[X1]], [[X1_V_I0]]
-; CHECK-NEXT:    [[Z1:%.*]] = add <16 x i8> [[Y1]], [[X1_V_I1]]
-; CHECK-NEXT:    [[Q1_V_I1:%.*]] = shufflevector <16 x i8> [[Z1]], <16 x i8> undef, <16 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[Q1:%.*]] = shufflevector <16 x i8> [[Z1]], <16 x i8> [[Q1_V_I1]], <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
-; CHECK-NEXT:    [[Q1_V_R1:%.*]] = shufflevector <16 x i8> [[Q1]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[Q1_V_R2:%.*]] = shufflevector <16 x i8> [[Q1]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT:    [[R:%.*]] = mul <8 x i8> [[Q1_V_R1]], [[Q1_V_R2]]
-; CHECK-NEXT:    ret <8 x i8> [[R]]
-;
-  %X1 = sub <8 x i8> %A1, %B1
-  %X2 = sub <8 x i8> %A2, %B2
-  %Y1 = mul <8 x i8> %X1, %A1
-  %Y2 = mul <8 x i8> %X2, %A2
-  %Z1 = add <8 x i8> %Y1, %B1
-  %Z2 = add <8 x i8> %Y2, %B2
-  %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
-  %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
-  %R  = mul <8 x i8> %Q1, %Q2
-  ret <8 x i8> %R
-}
-
-; Basic depth-3 chain (flipped order)
-define double @test7(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test7(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x double> [[X1_V_I1_1]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x double> [[X1_V_I0_1]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = fsub <2 x double> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <2 x double> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <2 x double> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z2 = fadd double %Y2, %B2
-  %Z1 = fadd double %Y1, %B1
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}
-
-; Basic depth-3 chain (subclass data)
-define i64 @test8(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
-; CHECK-LABEL: @test8(
-; CHECK-NEXT:    [[X1_V_I1_1:%.*]] = insertelement <2 x i64> undef, i64 [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_2:%.*]] = insertelement <2 x i64> [[X1_V_I1_1]], i64 [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0_1:%.*]] = insertelement <2 x i64> undef, i64 [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_2:%.*]] = insertelement <2 x i64> [[X1_V_I0_1]], i64 [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1:%.*]] = sub <2 x i64> [[X1_V_I0_2]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Y1:%.*]] = mul <2 x i64> [[X1]], [[X1_V_I0_2]]
-; CHECK-NEXT:    [[Z1:%.*]] = add <2 x i64> [[Y1]], [[X1_V_I1_2]]
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <2 x i64> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <2 x i64> [[Z1]], i32 1
-; CHECK-NEXT:    [[R:%.*]] = mul i64 [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %X1 = sub nsw i64 %A1, %B1
-  %X2 = sub i64 %A2, %B2
-  %Y1 = mul i64 %X1, %A1
-  %Y2 = mul i64 %X2, %A2
-  %Z1 = add i64 %Y1, %B1
-  %Z2 = add i64 %Y2, %B2
-  %R  = mul i64 %Z1, %Z2
-  ret i64 %R
-}
-
diff --git a/test/Transforms/BBVectorize/simple3.ll b/test/Transforms/BBVectorize/simple3.ll
deleted file mode 100644
index 7dd538bdfb0..00000000000
--- a/test/Transforms/BBVectorize/simple3.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-vector-bits=192 -bb-vectorize-ignore-target-info -instcombine -gvn -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-
-; Basic depth-3 chain
-define double @test1(double %A1, double %A2, double %A3, double %B1, double %B2, double %B3) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[X1_V_I1_11:%.*]] = insertelement <3 x double> undef, double [[B1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I1_22:%.*]] = insertelement <3 x double> [[X1_V_I1_11]], double [[B2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I1:%.*]] = insertelement <3 x double> [[X1_V_I1_22]], double [[B3:%.*]], i32 2
-; CHECK-NEXT:    [[X1_V_I0_13:%.*]] = insertelement <3 x double> undef, double [[A1:%.*]], i32 0
-; CHECK-NEXT:    [[X1_V_I0_24:%.*]] = insertelement <3 x double> [[X1_V_I0_13]], double [[A2:%.*]], i32 1
-; CHECK-NEXT:    [[X1_V_I0:%.*]] = insertelement <3 x double> [[X1_V_I0_24]], double [[A3:%.*]], i32 2
-; CHECK-NEXT:    [[X1:%.*]] = fsub <3 x double> [[X1_V_I0]], [[X1_V_I1]]
-; CHECK-NEXT:    [[Y1:%.*]] = fmul <3 x double> [[X1]], [[X1_V_I0]]
-; CHECK-NEXT:    [[Z1:%.*]] = fadd <3 x double> [[Y1]], [[X1_V_I1]]
-; CHECK-NEXT:    [[Z1_V_R210:%.*]] = extractelement <3 x double> [[Z1]], i32 2
-; CHECK-NEXT:    [[Z1_V_R1:%.*]] = extractelement <3 x double> [[Z1]], i32 0
-; CHECK-NEXT:    [[Z1_V_R2:%.*]] = extractelement <3 x double> [[Z1]], i32 1
-; CHECK-NEXT:    [[R1:%.*]] = fmul double [[Z1_V_R1]], [[Z1_V_R2]]
-; CHECK-NEXT:    [[R:%.*]] = fmul double [[R1]], [[Z1_V_R210]]
-; CHECK-NEXT:    ret double [[R]]
-;
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %X3 = fsub double %A3, %B3
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Y3 = fmul double %X3, %A3
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %Z3 = fadd double %Y3, %B3
-  %R1 = fmul double %Z1, %Z2
-  %R  = fmul double %R1, %Z3
-  ret double %R
-}
-
diff --git a/test/Transforms/BBVectorize/vector-sel.ll b/test/Transforms/BBVectorize/vector-sel.ll
deleted file mode 100644
index bc15073b5a1..00000000000
--- a/test/Transforms/BBVectorize/vector-sel.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -bb-vectorize -S | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@d = external global [1 x [10 x [1 x i16]]], align 16
-
-define void @test() {
-; CHECK-LABEL: @test(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[BOOL:%.*]] = icmp ne i32 undef, 0
-; CHECK-NEXT:    [[BOOLVEC:%.*]] = icmp ne <4 x i32> undef, zeroinitializer
-; CHECK-NEXT:    br label [[BODY:%.*]]
-; CHECK:       body:
-; CHECK-NEXT:    [[TMP0:%.*]] = select i1 [[BOOL]], <4 x i16> <i16 -2, i16 -2, i16 -2, i16 -2>, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>
-; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[BOOL]], <4 x i16> <i16 -2, i16 -2, i16 -2, i16 -2>, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i1> [[BOOLVEC]], <4 x i1> [[BOOLVEC]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = select <8 x i1> [[TMP3]], <8 x i16> <i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3, i16 -3>, <8 x i16> [[TMP2]]
-; CHECK-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr ([1 x [10 x [1 x i16]]], [1 x [10 x [1 x i16]]]* @d, i64 0, i64 0, i64 undef, i64 0) to <8 x i16>*), align 2
-; CHECK-NEXT:    ret void
-;
-entry:
-  %bool = icmp ne i32 undef, 0
-  %boolvec = icmp ne <4 x i32> undef, zeroinitializer
-  br label %body
-
-body:
-  %0 = select i1 %bool, <4 x i16> <i16 -2, i16 -2, i16 -2, i16 -2>, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>
-  %1 = select i1 %bool, <4 x i16> <i16 -2, i16 -2, i16 -2, i16 -2>, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>
-  %2 = select <4 x i1> %boolvec, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>, <4 x i16> %0
-  %3 = select <4 x i1> %boolvec, <4 x i16> <i16 -3, i16 -3, i16 -3, i16 -3>, <4 x i16> %1
-  %4 = add nsw <4 x i16> %2, zeroinitializer
-  %5 = add nsw <4 x i16> %3, zeroinitializer
-  %6 = getelementptr inbounds [1 x [10 x [1 x i16]]], [1 x [10 x [1 x i16]]]* @d, i64 0, i64 0, i64 undef, i64 0
-  %7 = bitcast i16* %6 to <4 x i16>*
-  store <4 x i16> %4, <4 x i16>* %7, align 2
-  %8 = getelementptr [1 x [10 x [1 x i16]]], [1 x [10 x [1 x i16]]]* @d, i64 0, i64 0, i64 undef, i64 4
-  %9 = bitcast i16* %8 to <4 x i16>*
-  store <4 x i16> %5, <4 x i16>* %9, align 2
-  ret void
-}
diff --git a/test/Transforms/BBVectorize/xcore/no-vector-registers.ll b/test/Transforms/BBVectorize/xcore/no-vector-registers.ll
deleted file mode 100644
index 9ebdb7368a3..00000000000
--- a/test/Transforms/BBVectorize/xcore/no-vector-registers.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S -mtriple=xcore | FileCheck %s
-
-target datalayout = "e-p:32:32:32-a0:0:32-n32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f16:16:32-f32:32:32-f64:32:32"
-target triple = "xcore"
-
-; Basic depth-3 chain
-define double @test1(double %A1, double %A2, double %B1, double %B2) {
-; CHECK-LABEL: @test1(
-; CHECK-NOT: <2 x double>
-  %X1 = fsub double %A1, %B1
-  %X2 = fsub double %A2, %B2
-  %Y1 = fmul double %X1, %A1
-  %Y2 = fmul double %X2, %A2
-  %Z1 = fadd double %Y1, %B1
-  %Z2 = fadd double %Y2, %B2
-  %R  = fmul double %Z1, %Z2
-  ret double %R
-}