Add a basic-block autovectorization pass.

This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). llvm-svn: 149468
2025-01-31 12:41:49 +01:00 · 2012-02-01 03:51:43 +00:00 · 2012-02-01 03:51:43 +00:00 · 8cf5de5774
commit 8cf5de5774
parent 6186319c3f
35 changed files with 2635 additions and 12 deletions
--- a/docs/Passes.html
+++ b/docs/Passes.html
@ -126,6 +126,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 <tr><td><a href="#adce">-adce</a></td><td>Aggressive Dead Code Elimination</td></tr>
 <tr><td><a href="#always-inline">-always-inline</a></td><td>Inliner for always_inline functions</td></tr>
 <tr><td><a href="#argpromotion">-argpromotion</a></td><td>Promote 'by reference' arguments to scalars</td></tr>
+<tr><td><a href="#bb-vectorize">-bb-vectorize</a></td><td>Combine instructions to form vector instructions within basic blocks</td></tr>
 <tr><td><a href="#block-placement">-block-placement</a></td><td>Profile Guided Basic Block Placement</td></tr>
 <tr><td><a href="#break-crit-edges">-break-crit-edges</a></td><td>Break critical edges in CFG</td></tr>
 <tr><td><a href="#codegenprepare">-codegenprepare</a></td><td>Optimize for code generation</td></tr>
@ -815,6 +816,26 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
  </p>
 </div>

+<!-------------------------------------------------------------------------- -->
+<h3>
+  <a name="bb-vectorize">-bb-vectorize: Basic-Block Vectorization</a>
+</h3>
+<div>
+  <p>This pass combines instructions inside basic blocks to form vector
+  instructions. It iterates over each basic block, attempting to pair
+  compatible instructions, repeating this process until no additional
+  pairs are selected for vectorization. When the outputs of some pair
+  of compatible instructions are used as inputs by some other pair of
+  compatible instructions, those pairs are part of a potential
+  vectorization chain. Instruction pairs are only fused into vector
+  instructions when they are part of a chain longer than some
+  threshold length. Moreover, the pass attempts to find the best
+  possible chain for each pair of compatible instructions. These
+  heuristics are intended to prevent vectorization in cases where
+  it would not improve the performance of the resulting code.
+  </p>
+</div>
+
 <!-------------------------------------------------------------------------- -->
 <h3>
  <a name="block-placement">-block-placement: Profile Guided Basic Block Placement</a>
--- a/include/llvm-c/Initialization.h
+++ b/include/llvm-c/Initialization.h
@ -25,6 +25,7 @@ extern "C" {
 void LLVMInitializeCore(LLVMPassRegistryRef R);
 void LLVMInitializeTransformUtils(LLVMPassRegistryRef R);
 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R);
+void LLVMInitializeVectorization(LLVMPassRegistryRef R);
 void LLVMInitializeInstCombine(LLVMPassRegistryRef R);
 void LLVMInitializeIPO(LLVMPassRegistryRef R);
 void LLVMInitializeInstrumentation(LLVMPassRegistryRef R);
--- a/include/llvm-c/Transforms/Vectorize.h
+++ b/include/llvm-c/Transforms/Vectorize.h
@ -0,0 +1,37 @@
+/*===---------------------------Vectorize.h ------------------- -*- C++ -*-===*\
+|*===----------- Vectorization Transformation Library C Interface ---------===*|
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This header declares the C interface to libLLVMVectorize.a, which          *|
+|* implements various vectorization transformations of the LLVM IR.           *|
+|*                                                                            *|
+|* Many exotic languages can interoperate with C code but have a harder time  *|
+|* with C++ due to name mangling. So in addition to C, this interface enables *|
+|* tools written in such languages.                                           *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_TRANSFORMS_VECTORIZE_H
+#define LLVM_C_TRANSFORMS_VECTORIZE_H
+
+#include "llvm-c/Core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** See llvm::createBBVectorizePass function. */
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
+
+#ifdef __cplusplus
+}
+#endif /* defined(__cplusplus) */
+
+#endif
+
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@ -31,6 +31,10 @@ void initializeTransformUtils(PassRegistry&);
 /// ScalarOpts library.
 void initializeScalarOpts(PassRegistry&);

+/// initializeVectorization - Initialize all passes linked into the
+/// Vectorize library.
+void initializeVectorization(PassRegistry&);
+
 /// initializeInstCombine - Initialize all passes linked into the
 /// ScalarOpts library.
 void initializeInstCombine(PassRegistry&);
@ -236,7 +240,7 @@ void initializeVirtRegMapPass(PassRegistry&);
 void initializeInstSimplifierPass(PassRegistry&);
 void initializeUnpackMachineBundlesPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
-
+void initializeBBVectorizePass(PassRegistry&);
 }

 #endif
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@ -31,6 +31,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include <cstdlib>

@ -151,6 +152,7 @@ namespace {
      (void) llvm::createCorrelatedValuePropagationPass();
      (void) llvm::createMemDepPrinter();
      (void) llvm::createInstructionSimplifierPass();
+      (void) llvm::createBBVectorizePass();

      (void)new llvm::IntervalPartition();
      (void)new llvm::FindUsedTypes();
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@ -99,6 +99,7 @@ public:
  bool DisableSimplifyLibCalls;
  bool DisableUnitAtATime;
  bool DisableUnrollLoops;
+  bool Vectorize;

 private:
  /// ExtensionList - This is list of all of the extensions that are registered.
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@ -0,0 +1,30 @@
+//===-- Vectorize.h - Vectorization Transformations -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes for accessor functions that expose passes
+// in the Vectorize transformations library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_H
+#define LLVM_TRANSFORMS_VECTORIZE_H
+
+namespace llvm {
+
+class BasicBlockPass;
+
+//===----------------------------------------------------------------------===//
+//
+// BBVectorize - A basic-block vectorization pass.
+//
+BasicBlockPass *createBBVectorizePass();
+
+} // End llvm namespace
+
+#endif
--- a/lib/Transforms/CMakeLists.txt
+++ b/lib/Transforms/CMakeLists.txt
@ -3,4 +3,5 @@ add_subdirectory(Instrumentation)
 add_subdirectory(InstCombine)
 add_subdirectory(Scalar)
 add_subdirectory(IPO)
+add_subdirectory(Vectorize)
 add_subdirectory(Hello)
--- a/lib/Transforms/IPO/LLVMBuild.txt
+++ b/lib/Transforms/IPO/LLVMBuild.txt
@ -20,4 +20,4 @@ type = Library
 name = IPO
 parent = Transforms
 library_name = ipo
-required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils
+required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -21,14 +21,20 @@
 #include "llvm/DefaultPasses.h"
 #include "llvm/PassManager.h"
 #include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ManagedStatic.h"

 using namespace llvm;

+static cl::opt<bool>
+RunVectorization("vectorize", cl::desc("Run vectorization passes"));
+
 PassManagerBuilder::PassManagerBuilder() {
    OptLevel = 2;
    SizeLevel = 0;
@ -37,6 +43,7 @@ PassManagerBuilder::PassManagerBuilder() {
    DisableSimplifyLibCalls = false;
    DisableUnitAtATime = false;
    DisableUnrollLoops = false;
+    Vectorize = RunVectorization;
 }

 PassManagerBuilder::~PassManagerBuilder() {
@ -172,6 +179,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {

  addExtensionsToPM(EP_ScalarOptimizerLate, MPM);

+  if (Vectorize) {
+    MPM.add(createBBVectorizePass());
+    MPM.add(createInstructionCombiningPass());
+    if (OptLevel > 1)
+      MPM.add(createGVNPass());                 // Remove redundancies
+  }
+
  MPM.add(createAggressiveDCEPass());         // Delete dead instructions
  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
  MPM.add(createInstructionCombiningPass());  // Clean up after everything.
--- a/lib/Transforms/LLVMBuild.txt
+++ b/lib/Transforms/LLVMBuild.txt
@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;

 [common]
-subdirectories = IPO InstCombine Instrumentation Scalar Utils
+subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize

 [component_0]
 type = Group
--- a/lib/Transforms/Makefile
+++ b/lib/Transforms/Makefile
@ -8,7 +8,7 @@
 ##===----------------------------------------------------------------------===##

 LEVEL = ../..
-PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello
+PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello

 include $(LEVEL)/Makefile.config

--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@ -0,0 +1,4 @@
+add_llvm_library(LLVMVectorize
+  BBVectorize.cpp
+  Vectorize.cpp
+  )
--- a/lib/Transforms/Vectorize/LLVMBuild.txt
+++ b/lib/Transforms/Vectorize/LLVMBuild.txt
@ -0,0 +1,24 @@
+;===- ./lib/Transforms/Vectorize/LLVMBuild.txt -----------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = Vectorize
+parent = Transforms
+library_name = Vectorize
+required_libraries = Analysis Core InstCombine Support Target TransformUtils
+
--- a/lib/Transforms/Vectorize/Makefile
+++ b/lib/Transforms/Vectorize/Makefile
@ -0,0 +1,15 @@
+##===- lib/Transforms/Vectorize/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMVectorize
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@ -0,0 +1,39 @@
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorize.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+/// initializeVectorization - Initialize all passes linked into the
+/// Vectorize library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+  initializeBBVectorizePass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+  initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBBVectorizePass());
+}
+
--- a/test/Transforms/BBVectorize/cycle.ll
+++ b/test/Transforms/BBVectorize/cycle.ll
@ -0,0 +1,112 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would
+; want to select the pairs:
+; %div77 = fdiv double %sub74, %mul76.v.r1 <->   %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117)
+; %add84 = fadd double %sub83, 2.000000e+00 <->   %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77)
+; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <->   %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84)
+; %mul117 = fmul double %sub39.v.r1, %sub116 <->   %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95)
+; and so a dependency cycle would be created.
+
+declare double @fabs(double) nounwind readnone
+define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) {
+entry:
+  br label %go
+go:
+  %conv = sitofp i32 %n.0 to double
+  %add35 = fadd double %conv, %a
+  %sub36 = fadd double %add35, -1.000000e+00
+  %add38 = fadd double %conv, %b
+  %sub39 = fadd double %add38, -1.000000e+00
+  %add41 = fadd double %conv, %c
+  %sub42 = fadd double %add41, -1.000000e+00
+  %sub45 = fadd double %add35, -2.000000e+00
+  %sub48 = fadd double %add38, -2.000000e+00
+  %sub51 = fadd double %add41, -2.000000e+00
+  %mul52 = shl nsw i32 %n.0, 1
+  %sub53 = add nsw i32 %mul52, -1
+  %conv54 = sitofp i32 %sub53 to double
+  %sub56 = add nsw i32 %mul52, -3
+  %conv57 = sitofp i32 %sub56 to double
+  %sub59 = add nsw i32 %mul52, -5
+  %conv60 = sitofp i32 %sub59 to double
+  %mul61 = mul nsw i32 %n.0, %n.0
+  %conv62 = sitofp i32 %mul61 to double
+  %mul63 = fmul double %conv62, 3.000000e+00
+  %mul67 = fmul double %sub65, %conv
+  %add68 = fadd double %mul63, %mul67
+  %add69 = fadd double %add68, 2.000000e+00
+  %sub71 = fsub double %add69, %mul2.v.r1
+  %sub74 = fsub double %sub71, %mul73
+  %mul75 = fmul double %conv57, 2.000000e+00
+  %mul76 = fmul double %mul75, %sub42
+  %div77 = fdiv double %sub74, %mul76
+  %mul82 = fmul double %add80, %conv
+  %sub83 = fsub double %mul63, %mul82
+  %add84 = fadd double %sub83, 2.000000e+00
+  %sub86 = fsub double %add84, %mul2.v.r1
+  %sub87 = fsub double -0.000000e+00, %sub86
+  %mul88 = fmul double %sub36, %sub87
+  %mul89 = fmul double %mul88, %sub39
+  %mul90 = fmul double %conv54, 4.000000e+00
+  %mul91 = fmul double %mul90, %conv57
+  %mul92 = fmul double %mul91, %sub51
+  %mul93 = fmul double %mul92, %sub42
+  %div94 = fdiv double %mul89, %mul93
+  %mul95 = fmul double %sub45, %sub36
+  %mul96 = fmul double %mul95, %sub48
+  %mul97 = fmul double %mul96, %sub39
+  %sub99 = fsub double %conv, %a
+  %sub100 = fadd double %sub99, -2.000000e+00
+  %mul101 = fmul double %mul97, %sub100
+  %sub103 = fsub double %conv, %b
+  %sub104 = fadd double %sub103, -2.000000e+00
+  %mul105 = fmul double %mul101, %sub104
+  %mul106 = fmul double %conv57, 8.000000e+00
+  %mul107 = fmul double %mul106, %conv57
+  %mul108 = fmul double %mul107, %conv60
+  %sub111 = fadd double %add41, -3.000000e+00
+  %mul112 = fmul double %mul108, %sub111
+  %mul113 = fmul double %mul112, %sub51
+  %mul114 = fmul double %mul113, %sub42
+  %div115 = fdiv double %mul105, %mul114
+  %sub116 = fsub double -0.000000e+00, %sub36
+  %mul117 = fmul double %sub39, %sub116
+  %sub119 = fsub double %conv, %c
+  %sub120 = fadd double %sub119, -1.000000e+00
+  %mul121 = fmul double %mul117, %sub120
+  %mul123 = fmul double %mul75, %sub51
+  %mul124 = fmul double %mul123, %sub42
+  %div125 = fdiv double %mul121, %mul124
+  %mul126 = fmul double %div77, %sub
+  %add127 = fadd double %mul126, 1.000000e+00
+  %mul128 = fmul double %add127, %Anm1.0
+  %mul129 = fmul double %div94, %sub
+  %add130 = fadd double %div125, %mul129
+  %mul131 = fmul double %add130, %sub
+  %mul132 = fmul double %mul131, %Anm2.0
+  %add133 = fadd double %mul128, %mul132
+  %mul134 = fmul double %div115, %mul1
+  %mul135 = fmul double %mul134, %Anm3.0
+  %add136 = fadd double %add133, %mul135
+  %mul139 = fmul double %add127, %Bnm1.0
+  %mul143 = fmul double %mul131, %Bnm2.0
+  %add144 = fadd double %mul139, %mul143
+  %mul146 = fmul double %mul134, %Bnm3.0
+  %add147 = fadd double %add144, %mul146
+  %div148 = fdiv double %add136, %add147
+  %sub149 = fsub double %F.0, %div148
+  %div150 = fdiv double %sub149, %F.0
+  %call = tail call double @fabs(double %div150) nounwind readnone
+  %cmp = fcmp olt double %call, 0x3CB0000000000000
+  %cmp152 = icmp sgt i32 %n.0, 20000
+  %or.cond = or i1 %cmp, %cmp152
+  br i1 %or.cond, label %done, label %go
+done:
+  ret void
+; CHECK: @test1
+; CHECK: go:
+; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
+; FIXME: When tree pruning is deterministic, include the entire output.
+}
--- a/test/Transforms/BBVectorize/dg.exp
+++ b/test/Transforms/BBVectorize/dg.exp
@ -0,0 +1,3 @@
+load_lib llvm.exp
+
+RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
--- a/test/Transforms/BBVectorize/ld1.ll
+++ b/test/Transforms/BBVectorize/ld1.ll
@ -0,0 +1,41 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %i2 = load double* %c, align 8
+  %add = fadd double %mul, %i2
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %arrayidx6 = getelementptr inbounds double* %c, i64 1
+  %i5 = load double* %arrayidx6, align 8
+  %add7 = fadd double %mul5, %i5
+  %mul9 = fmul double %add, %i1
+  %add11 = fadd double %mul9, %i2
+  %mul13 = fmul double %add7, %i4
+  %add15 = fadd double %mul13, %i5
+  %mul16 = fmul double %add11, %add15
+  ret double %mul16
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %i2 = load <2 x double>* %i2.v.i0, align 8
+; CHECK: %add = fadd <2 x double> %mul, %i2
+; CHECK: %mul9 = fmul <2 x double> %add, %i1
+; CHECK: %add11 = fadd <2 x double> %mul9, %i2
+; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0
+; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1
+; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2
+; CHECK: ret double %mul16
+}
+
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@ -0,0 +1,93 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; The second check covers the use of alias analysis (with loop unrolling).
+
+define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
+entry:
+  br label %for.body
+; CHECK: @test1
+; CHECK-UNRL: @test1
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+  %1 = load double* %arrayidx2, align 8
+  %mul = fmul double %0, %0
+  %mul3 = fmul double %0, %1
+  %add = fadd double %mul, %mul3
+  %add4 = fadd double %1, %1
+  %add5 = fadd double %add4, %0
+  %mul6 = fmul double %0, %add5
+  %add7 = fadd double %add, %mul6
+  %mul8 = fmul double %1, %1
+  %add9 = fadd double %0, %0
+  %add10 = fadd double %add9, %0
+  %mul11 = fmul double %mul8, %add10
+  %add12 = fadd double %add7, %mul11
+  %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+  store double %add12, double* %arrayidx14, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10
+  br i1 %exitcond, label %for.end, label %for.body
+; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+; CHECK: %0 = load double* %arrayidx, align 8
+; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+; CHECK: %1 = load double* %arrayidx2, align 8
+; CHECK: %mul = fmul double %0, %0
+; CHECK: %mul3 = fmul double %0, %1
+; CHECK: %add = fadd double %mul, %mul3
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
+; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
+; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
+; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
+; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1
+; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2
+; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1
+; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5
+; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0
+; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1
+; CHECK: %add7 = fadd double %add, %mul6.v.r1
+; CHECK: %add12 = fadd double %add7, %mul6.v.r2
+; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+; CHECK: store double %add12, double* %arrayidx14, align 8
+; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1
+; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
+; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*
+; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*
+; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+; CHECK-UNRL: %2 = load <2 x double>* %0, align 8
+; CHECK-UNRL: %3 = load <2 x double>* %1, align 8
+; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
+; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
+; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
+; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
+; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
+; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
+; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
+; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
+; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
+; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
+; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
+; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
+; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
+; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
+; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2
+; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
+; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
+; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
--- a/test/Transforms/BBVectorize/req-depth.ll
+++ b/test/Transforms/BBVectorize/req-depth.ll
@ -0,0 +1,17 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2
+
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%R  = fmul double %Y1, %Y2
+	ret double %R
+; CHECK-RD3: @test1
+; CHECK-RD2: @test1
+; CHECK-RD3-NOT: <2 x double>
+; CHECK-RD2: <2 x double>
+}
+
--- a/test/Transforms/BBVectorize/search-limit.ll
+++ b/test/Transforms/BBVectorize/search-limit.ll
@ -0,0 +1,46 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
+
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test1
+; CHECK-SL4: @test1
+; CHECK-SL4-NOT: <2 x double>
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y1, %B1
+        ; Here we have a dependency chain: the short search limit will not
+        ; see past this chain and so will not see the second part of the
+        ; pair to vectorize.
+        %mul41 = fmul double %Z1, %Y2
+        %sub48 = fsub double %Z1, %mul41
+        %mul62 = fmul double %Z1, %sub48
+        %sub69 = fsub double %Z1, %mul62
+        %mul83 = fmul double %Z1, %sub69
+        %sub90 = fsub double %Z1, %mul83
+        %mul104 = fmul double %Z1, %sub90
+        %sub111 = fsub double %Z1, %mul104
+        %mul125 = fmul double %Z1, %sub111
+        %sub132 = fsub double %Z1, %mul125
+        %mul146 = fmul double %Z1, %sub132
+        %sub153 = fsub double %Z1, %mul146
+        ; end of chain.
+	%Z2 = fadd double %Y2, %B2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+	%R1  = fdiv double %Z1, %Z2
+        %R   = fmul double %R1, %sub153
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@ -0,0 +1,59 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+declare double @llvm.fma.f64(double, double, double)
+declare double @llvm.cos.f64(double)
+
+; Basic depth-3 chain with fma
+define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
+	%Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test1
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
+; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1
+; CHECK: %Y1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> %Y1.v.i2.2)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with cos
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.cos.f64(double %X1)
+	%Y2 = call double @llvm.cos.f64(double %X2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test2
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
+; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly
+
--- a/test/Transforms/BBVectorize/simple-ldstr.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@ -0,0 +1,110 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
+
+; Simple 3-pair chain with loads and stores (it stores through %c, so the function must not be marked readonly)
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test1
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple chain with extending loads (float -> double via fpext) and stores; it writes through %c, so no readonly attribute
+define void @test2(float* %a, float* %b, double* %c) nounwind uwtable {
+entry:
+  %i0f = load float* %a, align 4
+  %i0 = fpext float %i0f to double
+  %i1f = load float* %b, align 4
+  %i1 = fpext float %i1f to double
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds float* %a, i64 1
+  %i3f = load float* %arrayidx3, align 4
+  %i3 = fpext float %i3f to double
+  %arrayidx4 = getelementptr inbounds float* %b, i64 1
+  %i4f = load float* %arrayidx4, align 4
+  %i4 = fpext float %i4f to double
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test2
+; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>*
+; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>*
+; CHECK: %i0f = load <2 x float>* %i0f.v.i0, align 4
+; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double>
+; CHECK: %i1f = load <2 x float>* %i1f.v.i0, align 4
+; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double>
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test2
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple chain with loads and truncating stores (double -> float via fptrunc); it writes through %c, so no readonly attribute
+define void @test3(double* %a, double* %b, float* %c) nounwind uwtable {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %mulf = fptrunc double %mul to float
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %mul5f = fptrunc double %mul5 to float
+  store float %mulf, float* %c, align 8
+  %arrayidx5 = getelementptr inbounds float* %c, i64 1
+  store float %mul5f, float* %arrayidx5, align 4
+  ret void
+; CHECK: @test3
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float>
+; CHECK: %0 = bitcast float* %c to <2 x float>*
+; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test3
+; CHECK-AO: %i0 = load double* %a, align 8
+; CHECK-AO: %i1 = load double* %b, align 8
+; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
+; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
+; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
+; CHECK-AO: %i3 = load double* %arrayidx3, align 8
+; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
+; CHECK-AO: %i4 = load double* %arrayidx4, align 8
+; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
+; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
+; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
+; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
+; CHECK-AO: %0 = bitcast float* %c to <2 x float>*
+; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
+; CHECK-AO: ret void
+}
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@ -0,0 +1,152 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain: paired fsub -> fmul -> fadd on scalar doubles, with both lanes feeding the final scalar fmul
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test1
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair permuted: the fadds consume %Y2 and %Y1 in swapped lane order)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test2
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y1, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair is a splat of the second lane: both fadds consume %Y2)
+define double @test3(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test3
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y2, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair is a splat of the first lane: both fadds consume %Y1)
+define double @test4(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test4
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y1, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain on <2 x float> inputs: each pair is expected to fuse into one <4 x float> operation
+define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) {
+; CHECK: @test5
+; CHECK: %X1.v.i1 = shufflevector <2 x float> %B1, <2 x float> %B2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %X1.v.i0 = shufflevector <2 x float> %A1, <2 x float> %A2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+	%X1 = fsub <2 x float> %A1, %B1
+	%X2 = fsub <2 x float> %A2, %B2
+; CHECK: %X1 = fsub <4 x float> %X1.v.i0, %X1.v.i1
+	%Y1 = fmul <2 x float> %X1, %A1
+	%Y2 = fmul <2 x float> %X2, %A2
+; CHECK: %Y1 = fmul <4 x float> %X1, %X1.v.i0
+	%Z1 = fadd <2 x float> %Y1, %B1
+	%Z2 = fadd <2 x float> %Y2, %B2
+; CHECK: %Z1 = fadd <4 x float> %Y1, %X1.v.i1
+	%R  = fmul <2 x float> %Z1, %Z2
+; CHECK: %Z1.v.r1 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK: %Z1.v.r2 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK: %R = fmul <2 x float> %Z1.v.r1, %Z1.v.r2
+	ret <2 x float> %R
+; CHECK: ret <2 x float> %R
+}
+
+; Basic chain with shuffles: <8 x i8> pairs fuse into <16 x i8> operations, and the two trailing shufflevectors fuse as well
+define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
+; CHECK: @test6
+; CHECK: %X1.v.i1 = shufflevector <8 x i8> %B1, <8 x i8> %B2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %X1.v.i0 = shufflevector <8 x i8> %A1, <8 x i8> %A2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+	%X1 = sub <8 x i8> %A1, %B1
+	%X2 = sub <8 x i8> %A2, %B2
+; CHECK: %X1 = sub <16 x i8> %X1.v.i0, %X1.v.i1
+	%Y1 = mul <8 x i8> %X1, %A1
+	%Y2 = mul <8 x i8> %X2, %A2
+; CHECK: %Y1 = mul <16 x i8> %X1, %X1.v.i0
+	%Z1 = add <8 x i8> %Y1, %B1
+	%Z2 = add <8 x i8> %Y2, %B2
+; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1
+        %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
+        %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
+; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15>
+; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
+	%R  = mul <8 x i8> %Q1, %Q2
+; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %Q1.v.r2 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2
+	ret <8 x i8> %R
+; CHECK: ret <8 x i8> %R
+}
+
+
--- a/tools/bugpoint/CMakeLists.txt
+++ b/tools/bugpoint/CMakeLists.txt
@ -1,5 +1,5 @@
 set(LLVM_LINK_COMPONENTS asmparser instrumentation scalaropts ipo
-  linker bitreader bitwriter)
+  linker bitreader bitwriter vectorize)

 add_llvm_tool(bugpoint
  BugDriver.cpp
--- a/tools/bugpoint/Makefile
+++ b/tools/bugpoint/Makefile
@ -10,6 +10,6 @@
 LEVEL := ../..
 TOOLNAME := bugpoint
 LINK_COMPONENTS := asmparser instrumentation scalaropts ipo linker bitreader \
-                   bitwriter
+                   bitwriter vectorize

 include $(LEVEL)/Makefile.common
--- a/tools/llvm-ld/CMakeLists.txt
+++ b/tools/llvm-ld/CMakeLists.txt
@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter)
+set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter vectorize)

 add_llvm_tool(llvm-ld
  Optimize.cpp
--- a/tools/llvm-ld/Makefile
+++ b/tools/llvm-ld/Makefile
@ -9,6 +9,6 @@

 LEVEL := ../..
 TOOLNAME := llvm-ld
-LINK_COMPONENTS := ipo scalaropts linker archive bitwriter
+LINK_COMPONENTS := ipo scalaropts linker archive bitwriter vectorize

 include $(LEVEL)/Makefile.common
--- a/tools/lto/CMakeLists.txt
+++ b/tools/lto/CMakeLists.txt
@ -1,6 +1,6 @@
 set(LLVM_LINK_COMPONENTS
  ${LLVM_TARGETS_TO_BUILD}
-  ipo scalaropts linker bitreader bitwriter mcdisassembler)
+  ipo scalaropts linker bitreader bitwriter mcdisassembler vectorize)

 add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" )

--- a/tools/lto/Makefile
+++ b/tools/lto/Makefile
@ -10,7 +10,7 @@
 LEVEL := ../..
 LIBRARYNAME := LTO
 LINK_COMPONENTS := all-targets ipo scalaropts linker bitreader bitwriter \
-                   mcdisassembler
+                   mcdisassembler vectorize
 LINK_LIBS_IN_SHARED := 1
 SHARED_LIBRARY := 1

--- a/tools/opt/CMakeLists.txt
+++ b/tools/opt/CMakeLists.txt
@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo)
+set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize)

 add_llvm_tool(opt
  AnalysisWrappers.cpp
--- a/tools/opt/Makefile
+++ b/tools/opt/Makefile
@ -9,6 +9,6 @@

 LEVEL := ../..
 TOOLNAME := opt
-LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo
+LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize

 include $(LEVEL)/Makefile.common
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@ -480,6 +480,7 @@ int main(int argc, char **argv) {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeCore(Registry);
  initializeScalarOpts(Registry);
+  initializeVectorization(Registry);
  initializeIPO(Registry);
  initializeAnalysis(Registry);
  initializeIPA(Registry);