[SLPVectorizer] reorderInputsAccordingToOpcode - remove non-Instruction canonicalization

Remove attempts to commute non-Instructions to the LHS - the codegen changes appear to rely on chance more than anything else and also have a tendency to fight existing instcombine canonicalization, which moves constants to the RHS of commutable binary ops. This is prep work towards: (a) reusing reorderInputsAccordingToOpcode for alt-shuffles and removing the similar reorderAltShuffleOperands; and (b) improving reordering to optimize cases with commutable and non-commutable instructions so that we still find splat/consecutive ops. Differential Revision: https://reviews.llvm.org/D59738 llvm-svn: 356913
2024-11-25 04:02:41 +01:00 · 2019-03-25 15:53:55 +00:00 · 2019-03-25 15:53:55 +00:00 · b3aca7fab1
commit b3aca7fab1
parent 54848a4db6
45 changed files with 162 additions and 167 deletions
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -2922,13 +2922,8 @@ void BoUpSLP::reorderInputsAccordingToOpcode(const InstructionsState &S,
    // Peel the first iteration out of the loop since there's nothing
    // interesting to do anyway and it simplifies the checks in the loop.
    auto *I = cast<Instruction>(VL[0]);
-    Value *VLeft = I->getOperand(0);
-    Value *VRight = I->getOperand(1);
-    if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
-      // Favor having instruction to the right. FIXME: why?
-      std::swap(VLeft, VRight);
-    Left.push_back(VLeft);
-    Right.push_back(VRight);
+    Left.push_back(I->getOperand(0));
+    Right.push_back(I->getOperand(1));
  }

  // Keep track if we have instructions with all the same opcode on one side.
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@ -2246,84 +2246,84 @@ define i32 @disabled(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O3DEFAULT-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; O3DEFAULT-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0
 ; O3DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> zeroinitializer
-; O3DEFAULT-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; O3DEFAULT-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP1]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4
 ; O3DEFAULT-NEXT:    [[TMP6:%.*]] = bitcast i32* [[ARRAYIDX_4]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; O3DEFAULT-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP7]]
+; O3DEFAULT-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[ARRAYIDX2_4]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 8
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 8
 ; O3DEFAULT-NEXT:    [[TMP10:%.*]] = bitcast i32* [[ARRAYIDX_8]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; O3DEFAULT-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP11]]
+; O3DEFAULT-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP11]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP13:%.*]] = bitcast i32* [[ARRAYIDX2_8]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 12
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 12
 ; O3DEFAULT-NEXT:    [[TMP14:%.*]] = bitcast i32* [[ARRAYIDX_12]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP15:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; O3DEFAULT-NEXT:    [[TMP16:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP15]]
+; O3DEFAULT-NEXT:    [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP17:%.*]] = bitcast i32* [[ARRAYIDX2_12]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 16
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 16
 ; O3DEFAULT-NEXT:    [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX_16]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP19:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP19]]
+; O3DEFAULT-NEXT:    [[TMP20:%.*]] = add nsw <4 x i32> [[TMP19]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP21:%.*]] = bitcast i32* [[ARRAYIDX2_16]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP21]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 20
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 20
 ; O3DEFAULT-NEXT:    [[TMP22:%.*]] = bitcast i32* [[ARRAYIDX_20]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
-; O3DEFAULT-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP23]]
+; O3DEFAULT-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[TMP23]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX2_20]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 24
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 24
 ; O3DEFAULT-NEXT:    [[TMP26:%.*]] = bitcast i32* [[ARRAYIDX_24]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP27:%.*]] = load <4 x i32>, <4 x i32>* [[TMP26]], align 4
-; O3DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP27]]
+; O3DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP29:%.*]] = bitcast i32* [[ARRAYIDX2_24]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP28]], <4 x i32>* [[TMP29]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 28
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 28
 ; O3DEFAULT-NEXT:    [[TMP30:%.*]] = bitcast i32* [[ARRAYIDX_28]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP31:%.*]] = load <4 x i32>, <4 x i32>* [[TMP30]], align 4
-; O3DEFAULT-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP31]]
+; O3DEFAULT-NEXT:    [[TMP32:%.*]] = add nsw <4 x i32> [[TMP31]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP33:%.*]] = bitcast i32* [[ARRAYIDX2_28]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP32]], <4 x i32>* [[TMP33]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 32
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 32
 ; O3DEFAULT-NEXT:    [[TMP34:%.*]] = bitcast i32* [[ARRAYIDX_32]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP35:%.*]] = load <4 x i32>, <4 x i32>* [[TMP34]], align 4
-; O3DEFAULT-NEXT:    [[TMP36:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP35]]
+; O3DEFAULT-NEXT:    [[TMP36:%.*]] = add nsw <4 x i32> [[TMP35]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[ARRAYIDX2_32]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP36]], <4 x i32>* [[TMP37]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 36
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 36
 ; O3DEFAULT-NEXT:    [[TMP38:%.*]] = bitcast i32* [[ARRAYIDX_36]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP39:%.*]] = load <4 x i32>, <4 x i32>* [[TMP38]], align 4
-; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP39]]
+; O3DEFAULT-NEXT:    [[TMP40:%.*]] = add nsw <4 x i32> [[TMP39]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP41:%.*]] = bitcast i32* [[ARRAYIDX2_36]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP40]], <4 x i32>* [[TMP41]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 40
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 40
 ; O3DEFAULT-NEXT:    [[TMP42:%.*]] = bitcast i32* [[ARRAYIDX_40]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP43:%.*]] = load <4 x i32>, <4 x i32>* [[TMP42]], align 4
-; O3DEFAULT-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP43]]
+; O3DEFAULT-NEXT:    [[TMP44:%.*]] = add nsw <4 x i32> [[TMP43]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP45:%.*]] = bitcast i32* [[ARRAYIDX2_40]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP44]], <4 x i32>* [[TMP45]], align 4
 ; O3DEFAULT-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 44
 ; O3DEFAULT-NEXT:    [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 44
 ; O3DEFAULT-NEXT:    [[TMP46:%.*]] = bitcast i32* [[ARRAYIDX_44]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    [[TMP47:%.*]] = load <4 x i32>, <4 x i32>* [[TMP46]], align 4
-; O3DEFAULT-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP47]]
+; O3DEFAULT-NEXT:    [[TMP48:%.*]] = add nsw <4 x i32> [[TMP47]], [[TMP3]]
 ; O3DEFAULT-NEXT:    [[TMP49:%.*]] = bitcast i32* [[ARRAYIDX2_44]] to <4 x i32>*
 ; O3DEFAULT-NEXT:    store <4 x i32> [[TMP48]], <4 x i32>* [[TMP49]], align 4
 ; O3DEFAULT-NEXT:    [[TMP50:%.*]] = load i32, i32* [[A]], align 4
--- a/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll
@ -66,7 +66,7 @@ define void @test2(<4 x i16> %a, <4 x i16> %b, i64 %c0, i64 %c1, i64 %c2, i64 %c
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[C1:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[C2:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[C3:%.*]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP4]], [[TMP0]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[GEP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[LOAD0:%.*]] = load i64, i64* [[GEP0]]
--- a/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@ -21,7 +21,7 @@ define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-NEXT:    [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], <i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> [[TMP5]], <i32 65537, i32 65537, i32 65537, i32 65537>
 ; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], <i32 65535, i32 65535, i32 65535, i32 65535>
-; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]])
 ; CHECK-NEXT:    ret i32 [[TMP10]]
--- a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@ -65,7 +65,7 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[T4:%.*]] = shl nsw i32 [[TMP5]], 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]]
@ -86,10 +86,10 @@ define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[TMP17:%.*]] = sext i32 [[TMP16]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[ADD11]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[T12]], i32 1
-; CHECK-NEXT:    [[TMP21]] = add nsw <2 x i32> [[TMP18]], [[TMP20]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[ADD11]], i32 1
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP21]] = add nsw <2 x i32> [[TMP19]], [[TMP20]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
@ -184,7 +184,7 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[T4]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP1]], [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <2 x i32> [[TMP9]], [[TMP1]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP12]]
@ -194,10 +194,10 @@ define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %
 ; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]]
 ; CHECK-NEXT:    [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[ADD11]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[T12]], i32 1
-; CHECK-NEXT:    [[TMP18]] = add nsw <2 x i32> [[TMP15]], [[TMP17]]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[ADD11]], i32 1
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i32> <i32 1, i32 undef>, i32 [[T12]], i32 1
+; CHECK-NEXT:    [[TMP18]] = add nsw <2 x i32> [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
--- a/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
+++ b/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@ -16,8 +16,8 @@ define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %a
 ; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x half> <half 0xH5380, half 0xH5380>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x half> <half 0xH57F0, half 0xH57F0>, [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x half> [[TMP2]], <half 0xH5380, half 0xH5380>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd fast <2 x half> [[TMP3]], <half 0xH57F0, half 0xH57F0>
 ; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>*
 ; CHECK-NEXT:    store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8
--- a/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35628_2.ll
@ -14,7 +14,7 @@ define void @test() #0 {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> <i64 3, i64 2, i64 1, i64 0>, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i64> [[TMP4]], <i64 3, i64 2, i64 1, i64 0>
 ; CHECK-NEXT:    [[TMP6]] = extractelement <4 x i64> [[TMP5]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[DUMMY_SHL:%.*]] = shl i64 [[TMP7]], 32
--- a/test/Transforms/SLPVectorizer/X86/PR35777.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR35777.ll
@ -10,7 +10,7 @@ define { i64, i64 } @patatino(double %arg) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 2) to <2 x double>*), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> undef, double [[ARG:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[ARG]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP0]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x double>, <2 x double>* bitcast (double* getelementptr inbounds ([6 x double], [6 x double]* @global, i64 0, i64 4) to <2 x double>*), align 16
 ; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], [[TMP5]]
--- a/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR39774.ll
@ -10,7 +10,7 @@ define void @Test(i32) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>, [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
 ; CHECK-NEXT:    [[VAL_1:%.*]] = and i32 [[TMP2]], undef
 ; CHECK-NEXT:    [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
 ; CHECK-NEXT:    [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
@ -100,7 +100,7 @@ define void @Test(i32) {
 ; FORCE_REDUCTION-NEXT:    [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ]
 ; FORCE_REDUCTION-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
 ; FORCE_REDUCTION-NEXT:    [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1
-; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> <i32 0, i32 55, i32 285, i32 1240>, [[SHUFFLE]]
+; FORCE_REDUCTION-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], <i32 0, i32 55, i32 285, i32 1240>
 ; FORCE_REDUCTION-NEXT:    [[VAL_1:%.*]] = and i32 [[TMP2]], undef
 ; FORCE_REDUCTION-NEXT:    [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]]
 ; FORCE_REDUCTION-NEXT:    [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]]
--- a/test/Transforms/SLPVectorizer/X86/PR40310.ll
+++ b/test/Transforms/SLPVectorizer/X86/PR40310.ll
@ -12,7 +12,7 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15
 ; CHECK-NEXT:    store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>, [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 -1>
 ; CHECK-NEXT:    [[V14:%.*]] = and i32 [[TMP2]], undef
 ; CHECK-NEXT:    [[V16:%.*]] = and i32 undef, [[V14]]
 ; CHECK-NEXT:    [[V18:%.*]] = and i32 undef, [[V16]]
--- a/test/Transforms/SLPVectorizer/X86/barriercall.ll
+++ b/test/Transforms/SLPVectorizer/X86/barriercall.ll
@ -15,7 +15,7 @@ define i32 @foo(i32* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 9, i32 9, i32 9, i32 9>, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    ret i32 undef
--- a/test/Transforms/SLPVectorizer/X86/commutativity.ll
+++ b/test/Transforms/SLPVectorizer/X86/commutativity.ll
@ -96,7 +96,7 @@ define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1
 ; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2
 ; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP9]], [[TMP12]]
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16
 ; CHECK-NEXT:    ret void
 ;
--- a/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
+++ b/test/Transforms/SLPVectorizer/X86/compare-reduce.ll
@ -20,8 +20,8 @@ define void @reduce_compare(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> <double 7.000000e+00, double 4.000000e+00>, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> <double 5.000000e+00, double 9.000000e+00>, [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 7.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 5.000000e+00, double 9.000000e+00>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
 ; CHECK-NEXT:    [[CMP11:%.*]] = fcmp ogt double [[TMP8]], [[TMP9]]
--- a/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll
@ -68,12 +68,12 @@ define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) {
 ; AVX-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; AVX-NEXT:    [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1
 ; AVX-NEXT:    [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]]
-; AVX-NEXT:    [[TMP9:%.*]] = fmul <2 x float> zeroinitializer, [[TMP0]]
+; AVX-NEXT:    [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer
 ; AVX-NEXT:    [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]]
 ; AVX-NEXT:    [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], <float 1.000000e+00, float 1.000000e+00>
 ; AVX-NEXT:    [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> <float 1.000000e+00, float 1.000000e+00>
 ; AVX-NEXT:    [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], <float -1.000000e+00, float -1.000000e+00>
-; AVX-NEXT:    [[TMP14:%.*]] = fmul <2 x float> zeroinitializer, [[TMP12]]
+; AVX-NEXT:    [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer
 ; AVX-NEXT:    [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> <float -0.000000e+00, float -0.000000e+00>, <2 x float> [[TMP14]]
 ; AVX-NEXT:    [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0
 ; AVX-NEXT:    [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1
--- a/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll
@ -99,7 +99,7 @@ define void @zot(%struct.hoge* %arg) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], undef
 ; CHECK-NEXT:    [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
--- a/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
+++ b/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll
@ -31,10 +31,10 @@ define void @main() #0 {
 ; CHECK:       cond.false66.us:
 ; CHECK-NEXT:    [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double 0xBFA5CC2D1960285F, i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> <double 0.000000e+00, double undef>, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> <double 1.400000e+02, double 1.400000e+02>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> <double 5.000000e+01, double 5.200000e+01>, [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 0.000000e+00, double 0xBFA5CC2D1960285F>
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], <double 1.400000e+02, double 1.400000e+02>
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], <double 5.000000e+01, double 5.200000e+01>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8
--- a/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
+++ b/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll
@ -22,7 +22,7 @@ define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> <float 5.000000e+00, float 8.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], <float 5.000000e+00, float 8.000000e+00>
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[G:%.*]], 0
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
 ; CHECK:       if.then:
--- a/test/Transforms/SLPVectorizer/X86/cse.ll
+++ b/test/Transforms/SLPVectorizer/X86/cse.ll
@ -18,20 +18,21 @@ define i32 @test(double* nocapture %G) {
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> <double 4.000000e+00, double 3.000000e+00>, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+00, double 6.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 4.000000e+00, double 3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+00, double 6.000000e+00>
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT:    [[ADD8:%.*]] = fadd double [[TMP5]], 7.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2
-; CHECK-NEXT:    store double [[ADD8]], double* [[ARRAYIDX9]], align 8
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
 ; CHECK-NEXT:    [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00
-; CHECK-NEXT:    [[ADD12:%.*]] = fadd double [[MUL11]], 8.000000e+00
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 7.000000e+00, double 8.000000e+00>
 ; CHECK-NEXT:    [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
-; CHECK-NEXT:    store double [[ADD12]], double* [[ARRAYIDX13]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret i32 undef
 ;
 entry:
@ -72,13 +73,13 @@ define i32 @foo(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> <double 7.900000e+00, double 7.700000e+00, double 7.600000e+00, double 7.400000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 7.900000e+00, double 7.700000e+00, double 7.600000e+00, double 7.400000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> undef, double [[CONV]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CONV]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[CONV]], i32 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CONV]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> <double 6.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00>, [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP7]], <double 6.000000e+00, double 2.000000e+00, double 3.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    store <4 x double> [[TMP8]], <4 x double>* [[TMP9]], align 8
 ; CHECK-NEXT:    ret i32 undef
@ -135,7 +136,7 @@ define i32 @test2(double* nocapture %G, i32 %k) {
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> [[TMP9]], double [[TMP8]], i32 1
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> <double 1.000000e+00, double 6.000000e+00>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 6.000000e+00>
 ; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[G]], i64 1
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8
@ -146,7 +147,7 @@ define i32 @test2(double* nocapture %G, i32 %k) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00
 ; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP18]], i32 1
-; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> <double 7.000000e+00, double 8.000000e+00>, [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], <double 7.000000e+00, double 8.000000e+00>
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds double, double* [[G]], i64 3
 ; CHECK-NEXT:    [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8
@ -203,13 +204,13 @@ define i32 @foo4(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds double, double* [[A]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[TMP0]], align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> <double 7.900000e+00, double 7.900000e+00, double 7.900000e+00, double 7.900000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], <double 7.900000e+00, double 7.900000e+00, double 7.900000e+00, double 7.900000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> undef, double [[CONV]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[CONV]], i32 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[CONV]], i32 2
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x double> [[TMP5]], double [[CONV]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <4 x double> [[TMP6]], [[TMP2]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> <double 6.000000e+00, double 6.000000e+00, double 6.000000e+00, double 6.000000e+00>, [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = fadd <4 x double> [[TMP7]], <double 6.000000e+00, double 6.000000e+00, double 6.000000e+00, double 6.000000e+00>
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast double* [[A]] to <4 x double>*
 ; CHECK-NEXT:    store <4 x double> [[TMP8]], <4 x double>* [[TMP9]], align 8
 ; CHECK-NEXT:    ret i32 undef
--- a/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
+++ b/test/Transforms/SLPVectorizer/X86/cycle_dup.ll
@ -24,7 +24,7 @@ define i32 @foo(i32* nocapture %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    [[TMP3:%.*]] = phi <4 x i32> [ [[TMP4:%.*]], [[FOR_BODY]] ], [ [[TMP1]], [[ENTRY]] ]
-; CHECK-NEXT:    [[TMP4]] = mul nsw <4 x i32> <i32 18, i32 19, i32 12, i32 9>, [[TMP3]]
+; CHECK-NEXT:    [[TMP4]] = mul nsw <4 x i32> [[TMP3]], <i32 18, i32 19, i32 12, i32 9>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_029]], 1
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[INC]], [[TMP2]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]]
--- a/test/Transforms/SLPVectorizer/X86/external_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/external_user.ll
@ -32,9 +32,9 @@ define double @ext_user(double* noalias nocapture %B, double* noalias nocapture
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_020:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+01, double 1.000000e+01>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP3]]
-; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_020]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
--- a/test/Transforms/SLPVectorizer/X86/extract.ll
+++ b/test/Transforms/SLPVectorizer/X86/extract.ll
@ -8,7 +8,7 @@ define void @fextr(double* %ptr) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* undef
 ; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> <double 0.000000e+00, double 1.100000e+00>, [[LD]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> [[LD]], <double 0.000000e+00, double 1.100000e+00>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
@ -32,7 +32,7 @@ define void @fextr1(double* %ptr) {
 ; CHECK-NEXT:    [[LD:%.*]] = load <2 x double>, <2 x double>* undef
 ; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x double> [[LD]], <2 x double> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[P1:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
-; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> <double 3.400000e+00, double 1.200000e+00>, [[REORDER_SHUFFLE]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]], <double 3.400000e+00, double 1.200000e+00>
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P1]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
@ -59,7 +59,7 @@ define void @fextr2(double* %ptr) {
 ; CHECK-NEXT:    [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> <double 5.500000e+00, double 6.600000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], <double 5.500000e+00, double 6.600000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
--- a/test/Transforms/SLPVectorizer/X86/extractcost.ll
+++ b/test/Transforms/SLPVectorizer/X86/extractcost.ll
@ -14,7 +14,7 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], <i32 5, i32 9, i32 3, i32 10>
 ; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
-; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> <i32 9, i32 9, i32 9, i32 9>, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], <i32 9, i32 9, i32 9, i32 9>
 ; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP7]], i32 0
--- a/test/Transforms/SLPVectorizer/X86/hoist.ll
+++ b/test/Transforms/SLPVectorizer/X86/hoist.ll
@ -25,7 +25,7 @@ define i32 @foo(i32* nocapture %A, i32 %n, i32 %k) {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_024]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[SHUFFLE]], [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[SHUFFLE]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[ARRAYIDX]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[ADD10]] = add nsw i32 [[I_024]], 4
--- a/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@ -36,7 +36,7 @@ define i32 @add_red(float* %A, i32 %n) {
 ; CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
 ; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float undef, undef
 ; CHECK-NEXT:    [[ADD11:%.*]] = fadd fast float [[ADD6]], undef
 ; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
@ -76,7 +76,7 @@ define i32 @add_red(float* %A, i32 %n) {
 ; STORE-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[ADD1330]]
 ; STORE-NEXT:    [[TMP1:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>*
 ; STORE-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; STORE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>, [[TMP2]]
+; STORE-NEXT:    [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
 ; STORE-NEXT:    [[ADD6:%.*]] = fadd fast float undef, undef
 ; STORE-NEXT:    [[ADD11:%.*]] = fadd fast float [[ADD6]], undef
 ; STORE-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
--- a/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
+++ b/test/Transforms/SLPVectorizer/X86/in-tree-user.ll
@ -21,8 +21,8 @@ define void @in_tree_user(double* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 8
 ; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP1]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> <double 7.000000e+00, double 4.000000e+00>, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> <double 5.000000e+00, double 9.000000e+00>, [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 7.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], <double 5.000000e+00, double 9.000000e+00>
 ; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 0
 ; CHECK-NEXT:    [[INTREEUSER:%.*]] = fadd double [[TMP8]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 1
--- a/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-after-bundle.ll
@ -132,7 +132,7 @@ define void @bar(i8* noalias nocapture readonly %a, i8* noalias nocapture readon
 ; CHECK-NEXT:    [[TMP24:%.*]] = icmp ult <16 x i8> [[TMP17]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = select <16 x i1> [[TMP24]], <16 x i8> [[TMP23]], <16 x i8> [[TMP21]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = zext <16 x i8> [[TMP25]] to <16 x i32>
-; CHECK-NEXT:    [[TMP27:%.*]] = mul <16 x i32> [[TMP15]], [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = mul <16 x i32> [[TMP26]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP28:%.*]] = trunc <16 x i32> [[TMP27]] to <16 x i8>
 ; CHECK-NEXT:    [[ARRAYIDX188:%.*]] = getelementptr inbounds i8, i8* [[E_ADDR_0354]], i64 15
 ; CHECK-NEXT:    [[TMP29:%.*]] = bitcast i8* [[E_ADDR_0354]] to <16 x i8>*
@ -413,52 +413,52 @@ define i32 @foo1() local_unnamed_addr #0 {
 ; CHECK-LABEL: @foo1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([64 x i32]* @ib to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <4 x i32> [[TMP0]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* bitcast ([64 x i32]* @ia to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 4) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 4) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 8) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP5]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 8) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 12) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = xor <4 x i32> [[TMP6]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 12) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 16) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 16) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 20) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP11:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = xor <4 x i32> [[TMP10]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 20) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 24) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 24) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP14:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 28) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = xor <4 x i32> [[TMP14]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP15]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 28) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP16:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 32) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP17:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = xor <4 x i32> [[TMP16]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP17]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 32) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP18:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 36) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP19:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = xor <4 x i32> [[TMP18]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP19]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 36) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP20:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 40) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP21:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = xor <4 x i32> [[TMP20]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP21]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 40) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP22:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 44) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = xor <4 x i32> [[TMP22]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP23]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 44) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP24:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 48) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP25:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP24]]
+; CHECK-NEXT:    [[TMP25:%.*]] = xor <4 x i32> [[TMP24]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP25]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 48) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP26:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 52) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP27:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP26]]
+; CHECK-NEXT:    [[TMP27:%.*]] = xor <4 x i32> [[TMP26]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP27]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 52) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP28:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 56) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP29:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP28]]
+; CHECK-NEXT:    [[TMP29:%.*]] = xor <4 x i32> [[TMP28]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP29]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 56) to <4 x i32>*), align 16
 ; CHECK-NEXT:    [[TMP30:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ib, i64 0, i64 60) to <4 x i32>*), align 16
-; CHECK-NEXT:    [[TMP31:%.*]] = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, [[TMP30]]
+; CHECK-NEXT:    [[TMP31:%.*]] = xor <4 x i32> [[TMP30]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP31]], <4 x i32>* bitcast (i32* getelementptr inbounds ([64 x i32], [64 x i32]* @ia, i64 0, i64 60) to <4 x i32>*), align 16
 ; CHECK-NEXT:    br label [[FOR_BODY5:%.*]]
 ; CHECK:       for.cond3:
--- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
@ -627,7 +627,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
 ; CHECK-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
 ; CHECK-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
@ -645,7 +645,7 @@ define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) {
 ; ZEROTHRESH-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> [[TMP1]], double [[Y:%.*]], i32 2
 ; ZEROTHRESH-NEXT:    [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[Z:%.*]], i32 3
 ; ZEROTHRESH-NEXT:    [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], <double 0.000000e+00, double 1.000000e+00, double 2.000000e+00, double 3.000000e+00>
-; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = fmul <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[TMP4]]
+; ZEROTHRESH-NEXT:    [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
 ; ZEROTHRESH-NEXT:    [[TMP6:%.*]] = extractelement <4 x double> [[TMP5]], i32 0
 ; ZEROTHRESH-NEXT:    [[I1:%.*]] = insertelement <4 x double> undef, double [[TMP6]], i32 3
 ; ZEROTHRESH-NEXT:    [[TMP7:%.*]] = extractelement <4 x double> [[TMP5]], i32 1
--- a/test/Transforms/SLPVectorizer/X86/long_chains.ll
+++ b/test/Transforms/SLPVectorizer/X86/long_chains.ll
@ -11,22 +11,22 @@ define i32 @test(double* nocapture %A, i8* nocapture %B) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[B:%.*]] to <2 x i8>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, <2 x i8>* [[TMP0]], align 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i8> <i8 3, i8 3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i8> [[TMP1]], <i8 3, i8 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i8> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x i8> undef, i8 [[TMP3]], i32 0
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i8> [[TMP4]], i8 [[TMP5]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = sitofp <2 x i8> [[TMP6]] to <2 x double>
 ; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP7]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP12]]
+; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP14:%.*]] = fmul <2 x double> [[TMP13]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd <2 x double> [[TMP14]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], [[TMP15]]
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x double> <double 1.000000e+00, double 1.000000e+00>, [[TMP16]]
+; CHECK-NEXT:    [[TMP17:%.*]] = fadd <2 x double> [[TMP16]], <double 1.000000e+00, double 1.000000e+00>
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast double* [[A:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP17]], <2 x double>* [[TMP18]], align 8
 ; CHECK-NEXT:    ret i32 undef
--- a/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
+++ b/test/Transforms/SLPVectorizer/X86/loopinvariant.ll
@ -36,7 +36,7 @@ define i32 @foo(i32* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[N]], i32 5
 ; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <8 x i32> [[TMP14]], i32 [[N]], i32 6
 ; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x i32> [[TMP15]], i32 [[N]], i32 7
-; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <8 x i32> [[TMP16]], [[TMP8]]
+; CHECK-NEXT:    [[TMP17:%.*]] = add nsw <8 x i32> [[TMP8]], [[TMP16]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i32* [[ARRAYIDX]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[TMP17]], <8 x i32>* [[TMP18]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 8
--- a/test/Transforms/SLPVectorizer/X86/multi_block.ll
+++ b/test/Transforms/SLPVectorizer/X86/multi_block.ll
@ -26,10 +26,10 @@ define i32 @bar(double* nocapture %A, i32 %d) {
 ; CHECK-NEXT:    br i1 [[TMP4]], label [[TMP7:%.*]], label [[TMP5:%.*]]
 ; CHECK:         [[TMP6:%.*]] = tail call i32 (...) @foo()
 ; CHECK-NEXT:    br label [[TMP7]]
-; CHECK:         [[TMP8:%.*]] = fadd <2 x float> <float 4.000000e+00, float 5.000000e+00>, [[TMP3]]
+; CHECK:         [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], <float 4.000000e+00, float 5.000000e+00>
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8
 ; CHECK-NEXT:    [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
-; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> <double 9.000000e+00, double 5.000000e+00>, [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], <double 9.000000e+00, double 5.000000e+00>
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[TMP9]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP11]], <2 x double>* [[TMP12]], align 8
 ; CHECK-NEXT:    ret i32 undef
--- a/test/Transforms/SLPVectorizer/X86/multi_user.ll
+++ b/test/Transforms/SLPVectorizer/X86/multi_user.ll
@ -19,7 +19,7 @@ define i32 @foo(i32* nocapture %A, i32 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1]], i32 2
 ; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> <i32 7, i32 8, i32 9, i32 10>, [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], <i32 7, i32 8, i32 9, i32 10>
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
 ; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]]
--- a/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/test/Transforms/SLPVectorizer/X86/operandorder.ll
@ -14,7 +14,7 @@ define void @shuffle_operands1(double * noalias %from, double * noalias %to,
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> undef, double [[V1:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TO:%.*]] to <2 x double>*
 ; CHECK-NEXT:    store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 4
 ; CHECK-NEXT:    ret void
--- a/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/phi.ll
@ -81,9 +81,9 @@ define i32 @foo2(double* noalias nocapture %B, double* noalias nocapture %A, i32
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> <double 1.000000e+01, double 1.000000e+01>, [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP3]]
-; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> <double 4.000000e+00, double 4.000000e+00>, [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], <double 1.000000e+01, double 1.000000e+01>
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], <double 4.000000e+00, double 4.000000e+00>
+; CHECK-NEXT:    [[TMP5]] = fadd <2 x double> [[TMP4]], <double 4.000000e+00, double 4.000000e+00>
 ; CHECK-NEXT:    [[INC]] = add nsw i32 [[I_019]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
@ -150,9 +150,9 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00
 ; CHECK-NEXT:    [[ADD6]] = fadd float [[R_052]], [[MUL]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2
@ -163,27 +163,26 @@ define float @foo3(float* nocapture readonly %A) #0 {
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>*
 ; CHECK-NEXT:    [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4
 ; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float undef>, float [[TMP4]], i32 3
-; CHECK-NEXT:    [[TMP12]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x float> undef, float [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP14]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1
-; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP14]], i32 1
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float 8.000000e+00, i32 3
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul <4 x float> [[TMP11]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121
+; CHECK-NEXT:    [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], <float 1.100000e+01, float 1.000000e+01, float 9.000000e+00, float 8.000000e+00>
+; CHECK-NEXT:    [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 3
-; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]]
-; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 2
-; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]]
-; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 1
-; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]]
-; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 0
-; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]]
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3
+; CHECK-NEXT:    [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2
+; CHECK-NEXT:    [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1
+; CHECK-NEXT:    [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]]
+; CHECK-NEXT:    [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0
+; CHECK-NEXT:    [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]]
 ; CHECK-NEXT:    ret float [[ADD31]]
 ;
 entry:
@ -255,7 +254,7 @@ define float @sort_phi_type(float* nocapture readonly %A) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP5]], i32 2
 ; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3
-; CHECK-NEXT:    [[TMP9]] = fmul <4 x float> <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9]] = fmul <4 x float> [[TMP8]], <float 8.000000e+00, float 9.000000e+00, float 1.000000e+02, float 1.110000e+02>
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
--- a/test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ b/test/Transforms/SLPVectorizer/X86/pr35497.ll
@ -55,7 +55,7 @@ define void @pr35497() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> <i64 20, i64 20>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> [[TMP2]], <i64 20, i64 20>
 ; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
 ; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
@ -63,7 +63,7 @@ define void @pr35497() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
 ; CHECK-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], <i64 2, i64 2>
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> <i64 20, i64 20>, [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> [[TMP8]], <i64 20, i64 20>
 ; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
 ; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
--- a/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
+++ b/test/Transforms/SLPVectorizer/X86/propagate_ir_flags.ll
@ -88,7 +88,7 @@ define void @nsw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@ -124,7 +124,7 @@ define void @not_nsw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@ -160,7 +160,7 @@ define void @nuw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add nuw <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@ -196,7 +196,7 @@ define void @not_nuw(i32* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[TMP2]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[IDX1]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@ -232,7 +232,7 @@ define void @nnan(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd nnan <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd nnan <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@ -268,7 +268,7 @@ define void @not_nnan(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@ -304,7 +304,7 @@ define void @only_fast(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
@ -340,7 +340,7 @@ define void @only_arcp(float* %x) {
 ; CHECK-NEXT:    [[IDX4:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd arcp <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = fadd arcp <4 x float> [[TMP2]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[IDX1]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP3]], <4 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    ret void
--- a/test/Transforms/SLPVectorizer/X86/reduction.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction.ll
@ -23,7 +23,7 @@ define i32 @reduce(double* nocapture %A, i32 %n, i32 %m) {
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i32 [[MUL]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> <double 7.000000e+00, double 7.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 7.000000e+00, double 7.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
 ; CHECK-NEXT:    [[ADD5:%.*]] = fadd double [[TMP3]], [[TMP4]]
--- a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@ -14,10 +14,10 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 undef, [[SUM]]
 ; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 undef, [[ADD]]
 ; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 undef, [[ADD_1]]
@ -32,11 +32,11 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT:    [[OP_EXTRA]] = add i32 [[TMP3]], [[SUM]]
 ; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[BIN_EXTRA]]
+; CHECK-NEXT:    ret i32 [[OP_EXTRA]]
 ;
 entry:
  %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
--- a/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@ -18,7 +18,7 @@ define void @hoge() {
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> <i32 63, i32 undef>, [[REORDER_SHUFFLE]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
 ; CHECK-NEXT:    [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> <i32 undef, i32 15, i32 31, i32 47>, [[SHUFFLE8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], <i32 undef, i32 15, i32 31, i32 47>
 ; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 undef, undef
 ; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], undef
@ -37,7 +37,7 @@ define void @hoge() {
 ; CHECK-NEXT:    [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef
 ; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
-; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> <i32 -49, i32 -33, i32 -33, i32 -17>, [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], <i32 -49, i32 -33, i32 -33, i32 -17>
 ; CHECK-NEXT:    [[TMP26:%.*]] = icmp sgt i32 undef, undef
 ; CHECK-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[TMP28:%.*]] = icmp sgt i32 [[TMP27]], undef
--- a/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/test/Transforms/SLPVectorizer/X86/resched.ll
@ -72,7 +72,7 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14
 ; CHECK-NEXT:    [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15
 ; CHECK-NEXT:    [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8>
-; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, [[TMP43]]
+; CHECK-NEXT:    [[TMP44:%.*]] = and <16 x i8> [[TMP43]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
 ; CHECK-NEXT:    [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15
 ; CHECK-NEXT:    [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 ; CHECK-NEXT:    store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1
--- a/test/Transforms/SLPVectorizer/X86/saxpy.ll
+++ b/test/Transforms/SLPVectorizer/X86/saxpy.ll
@ -15,7 +15,7 @@ define void @SAXPY(i32* noalias nocapture %x, i32* noalias nocapture %y, i32 %a,
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3
-; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP8]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP11]]
--- a/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
+++ b/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll
@ -12,7 +12,7 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], <i32 31, i32 31, i32 31, i32 31>
-; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> <i32 1, i32 1, i32 1, i32 1>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], <i32 1, i32 1, i32 1, i32 1>
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4
 ; CHECK-NEXT:    [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31
--- a/test/Transforms/SLPVectorizer/X86/simple-loop.ll
+++ b/test/Transforms/SLPVectorizer/X86/simple-loop.ll
@ -14,8 +14,8 @@ define i32 @rollable(i32* noalias nocapture %in, i32* noalias nocapture %out, i6
 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i32> <i32 7, i32 7, i32 7, i32 7>, [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> <i32 7, i32 14, i32 21, i32 28>, [[TMP6]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <4 x i32> [[TMP5]], <i32 7, i32 7, i32 7, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = add <4 x i32> [[TMP6]], <i32 7, i32 14, i32 21, i32 28>
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 [[TMP2]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP9]], align 4
--- a/test/Transforms/SLPVectorizer/X86/value-bug.ll
+++ b/test/Transforms/SLPVectorizer/X86/value-bug.ll
@ -33,9 +33,9 @@ define void @test() {
 ; CHECK-NEXT:    br i1 undef, label [[BB32_I]], label [[BB21_I]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP3]] to <2 x double>
-; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> <double undef, double 0.000000e+00>, [[TMP9]]
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul <2 x double> [[TMP9]], <double undef, double 0.000000e+00>
 ; CHECK-NEXT:    [[TMP11:%.*]] = fadd <2 x double> undef, [[TMP10]]
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> undef, [[TMP11]]
+; CHECK-NEXT:    [[TMP12:%.*]] = fadd <2 x double> [[TMP11]], undef
 ; CHECK-NEXT:    [[TMP13]] = fptrunc <2 x double> [[TMP12]] to <2 x float>
 ; CHECK-NEXT:    br label [[BB283]]
 ;
--- a/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ b/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@ -12,7 +12,7 @@ define void @add0(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 1, i32 1, i32 2, i32 3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 1, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -136,7 +136,7 @@ define void @sub1(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 4, i32 -1, i32 -2, i32 -3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 4, i32 -1, i32 -2, i32 -3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -174,7 +174,7 @@ define void @sub2(i32* noalias %dst, i32* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 -1, i32 -1, i32 -2, i32 -3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 -2, i32 -3>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -422,7 +422,7 @@ define void @add0f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -546,7 +546,7 @@ define void @sub1f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -584,7 +584,7 @@ define void @sub2f(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -751,7 +751,7 @@ define void @add0fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 1.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -875,7 +875,7 @@ define void @sub1fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float 4.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
@ -913,7 +913,7 @@ define void @sub2fn(float* noalias %dst, float* noalias %src) {
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float -2.000000e+00, float -3.000000e+00>
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
 ; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
--- a/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@ -17,7 +17,7 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
@ -91,7 +91,7 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef
@ -169,7 +169,7 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
 ; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
-; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[TMP9]], [[SHUFFLE]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 undef, undef
 ; CHECK-NEXT:    [[COND:%.*]] = select i1 [[CMP]], i32 undef, i32 undef
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp ult i32 [[COND]], undef