[DAG] Refactor DAGCombiner::ReassociateOps
Summary:
Extract the logic for doing reassociations from DAGCombiner::reassociateOps
into a helper function DAGCombiner::reassociateOpsCommutative, and use that
helper to trigger reassociation on the original operand order, or the
commuted operand order.

Codegen is not identical since the operand order will be different when doing
the reassociations for the commuted case. That causes some unfortunate churn
in some test cases. Apart from that this should be NFC.

Reviewers: spatel, craig.topper, tstellar

Reviewed By: spatel

Subscribers: dmgreen, dschuff, jvesely, nhaehnle, javed.absar, sbc100, jgravelle-google, hiraditya, aheejin, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61199

llvm-svn: 359476
This commit is contained in:
parent 751032ce89
commit 506a5c80b2
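For intuition, here is a minimal standalone C++ sketch of the pattern this change introduces. This is not the LLVM code itself, and every name in it (Expr, reassocCommutative, reassociate) is illustrative only: one helper recognizes the canonical shape (op (op x, c1), c2), and the caller covers the commuted shape (op c2, (op x, c1)) by calling the same helper again with its operands swapped, which previously required a duplicated block.

#include <optional>

// Toy expression node: a leaf is a constant or a variable; an inner node
// applies a commutative binary operator to two children.
struct Expr {
  char Op = 0;                  // operator for inner nodes, 0 for leaves
  const Expr *LHS = nullptr;
  const Expr *RHS = nullptr;
  int Value = 0;                // payload for constant leaves
  bool IsConst = false;
  bool isConstantLeaf() const { return !LHS && !RHS && IsConst; }
};

// Analogue of reassociateOpsCommutative: only fires when N0 is itself the
// same operation, i.e. the tree has the shape (Op (Op x, c1), c2).
std::optional<int> reassocCommutative(char Op, const Expr &N0, const Expr &N1) {
  if (N0.Op != Op || !N0.RHS || !N0.RHS->isConstantLeaf() ||
      !N1.isConstantLeaf())
    return std::nullopt;
  return N0.RHS->Value + N1.Value; // fold c1 + c2 ('+' stands in for any op)
}

// Analogue of reassociateOps: try the original operand order, then the
// commuted order, so (Op c2, (Op x, c1)) is handled by the same helper.
std::optional<int> reassociate(char Op, const Expr &N0, const Expr &N1) {
  if (auto C = reassocCommutative(Op, N0, N1))
    return C;
  return reassocCommutative(Op, N1, N0);
}

The second call is also why codegen changes: the commuted case now reaches the helper with N0 and N1 swapped, so the operands of the rebuilt node come out in a different order than the old hand-written commuted block produced, which accounts for the churn in the test diffs below.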
@@ -458,7 +458,9 @@ namespace {
     SDValue visitFMULForFMADistributiveCombine(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
-    SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+    SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
+                                      SDValue N1);
+    SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                            SDValue N1, SDNodeFlags Flags);
 
     SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
@@ -1000,53 +1002,50 @@ static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
          ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
 }
 
-SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
-                                    SDValue N1, SDNodeFlags Flags) {
-  // Don't reassociate reductions.
-  if (Flags.hasVectorReduction())
-    return SDValue();
-
-  EVT VT = N0.getValueType();
-  if (N0.getOpcode() == Opc && !N0->getFlags().hasVectorReduction()) {
-    if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
-      if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
-        // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
-        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, L, R))
-          return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
-        return SDValue();
-      }
-      if (N0.hasOneUse()) {
-        // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
-        // use
-        SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
-        if (!OpNode.getNode())
-          return SDValue();
-        AddToWorklist(OpNode.getNode());
-        return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
-      }
-    }
-  }
-
-  if (N1.getOpcode() == Opc && !N1->getFlags().hasVectorReduction()) {
-    if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) {
-      if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
-        // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
-        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, R, L))
-          return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
-        return SDValue();
-      }
-      if (N1.hasOneUse()) {
-        // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one
-        // use
-        SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
-        if (!OpNode.getNode())
-          return SDValue();
-        AddToWorklist(OpNode.getNode());
-        return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
-      }
-    }
-  }
-
+// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
+// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
+SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
+                                               SDValue N0, SDValue N1) {
+  EVT VT = N0.getValueType();
+
+  if (N0.getOpcode() != Opc)
+    return SDValue();
+
+  // Don't reassociate reductions.
+  if (N0->getFlags().hasVectorReduction())
+    return SDValue();
+
+  if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
+    if (SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+      // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
+      if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, C1, C2))
+        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+      return SDValue();
+    }
+    if (N0.hasOneUse()) {
+      // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
+      // iff (op x, c1) has one use
+      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
+      if (!OpNode.getNode())
+        return SDValue();
+      AddToWorklist(OpNode.getNode());
+      return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
+    }
+  }
+  return SDValue();
+}
+
+// Try to reassociate commutative binops.
+SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+                                    SDValue N1, SDNodeFlags Flags) {
+  assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
+  // Don't reassociate reductions.
+  if (Flags.hasVectorReduction())
+    return SDValue();
+
+  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
+    return Combined;
+  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
+    return Combined;
   return SDValue();
 }
@@ -2193,7 +2192,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
     return NewSel;
 
   // reassociate add
-  if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
+  if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
     return RADD;
 
   // fold ((0-A) + B) -> B-A
@@ -3275,7 +3274,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
                                      N0.getOperand(1), N1));
 
   // reassociate mul
-  if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
     return RMUL;
 
   return SDValue();
@@ -4799,7 +4798,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     return NewSel;
 
   // reassociate and
-  if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
     return RAND;
 
   // Try to convert a constant mask AND into a shuffle clear mask.
@@ -5525,7 +5524,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
       return BSwap;
 
   // reassociate or
-  if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
+  if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
     return ROR;
 
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
@@ -6412,7 +6411,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     return NewSel;
 
   // reassociate xor
-  if (SDValue RXOR = ReassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
+  if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
@@ -19,8 +19,8 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
 ; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
 ; CHECK-NEXT: b.ne
 ; Next BB
-; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], [[I2]]
-; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], [[I1]]
+; CHECK: add [[BLOCKBASE1:x[0-9]+]], [[I1]], [[BLOCKBASE]]
+; CHECK-NEXT: add [[BLOCKBASE2:x[0-9]+]], [[I2]], [[BLOCKBASE]]
 ; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1]
 ; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1]
 ; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]]
@@ -184,14 +184,14 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
 ; VI: s_and_b32 s1, s0, 0xffff0000
 ; VI: s_add_i32 s0, s0, 1
 ; VI: s_and_b32 s0, s0, 0xffff
-; VI: s_or_b32 s0, s0, s1
+; VI: s_or_b32 s0, s1, s0
 ; VI: s_add_i32 s0, s0, 0x10000
 ; VI: v_mov_b32_e32 v0, s0
 
 ; SI: s_lshl_b32 s1, s1, 16
 ; SI: s_add_i32 s0, s0, 1
 ; SI: s_and_b32 s0, s0, 0xffff
-; SI: s_or_b32 s0, s0, s1
+; SI: s_or_b32 s0, s1, s0
 ; SI: s_add_i32 s0, s0, 0x10000
 define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
   %add = add <2 x i16> %arg0, <i16 1, i16 1>
@@ -289,18 +289,18 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v1
 ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v1
 ; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v6
-; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
 ; SI-NEXT: v_and_b32_e32 v7, s12, v7
+; SI-NEXT: v_add_i32_e32 v4, vcc, 9, v4
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_or_b32_e32 v1, v7, v6
+; SI-NEXT: v_or_b32_e32 v0, v6, v7
 ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; SI-NEXT: v_and_b32_e32 v0, s12, v4
-; SI-NEXT: v_or_b32_e32 v0, v0, v5
-; SI-NEXT: v_add_i32_e32 v1, vcc, 0x900, v1
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_and_b32_e32 v1, s12, v4
+; SI-NEXT: v_add_i32_e32 v0, vcc, 0x900, v0
+; SI-NEXT: v_or_b32_e32 v1, v5, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
 ; SI-NEXT: v_or_b32_e32 v0, v1, v0
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x9000000, v0
 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -335,8 +335,8 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* n
 ; VI-NEXT: v_add_u16_e32 v9, 9, v5
 ; VI-NEXT: v_add_u16_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v7
-; VI-NEXT: v_or_b32_sdwa v0, v9, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; VI-NEXT: v_or_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; VI-NEXT: v_or_b32_sdwa v0, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; VI-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT: v_add_u16_e32 v0, s8, v0
 ; VI-NEXT: v_add_u16_sdwa v1, v1, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -71,7 +71,7 @@ define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, [8
 ; FUNC-LABEL: {{^}}test_add_shl_add_constant_inv:
 ; SI-DAG: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Y:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
 ; SI: s_lshl_b32 [[SHL3:s[0-9]+]], s[[X]], 3
-; SI: s_add_i32 [[TMP:s[0-9]+]], s[[Y]], [[SHL3]]
+; SI: s_add_i32 [[TMP:s[0-9]+]], [[SHL3]], s[[Y]]
 ; SI: s_addk_i32 [[TMP]], 0x3d8
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]]
 ; SI: buffer_store_dword [[VRESULT]]
@@ -216,7 +216,7 @@ define amdgpu_kernel void @widen_v2i8_constant_load(<2 x i8> addrspace(4)* %arg)
 ; SI-NEXT: s_add_i32 s0, s0, 12
 ; SI-NEXT: s_or_b32 s0, s0, 4
 ; SI-NEXT: s_and_b32 s0, s0, 0xff
-; SI-NEXT: s_or_b32 s0, s0, s1
+; SI-NEXT: s_or_b32 s0, s1, s0
 ; SI-NEXT: s_addk_i32 s0, 0x2c00
 ; SI-NEXT: s_or_b32 s0, s0, 0x300
 ; SI-NEXT: v_mov_b32_e32 v0, s0
@@ -414,35 +414,35 @@ entry:
 define arm_aapcscc zeroext i1 @cmp_and8_short_int(i16* nocapture readonly %a, i32* nocapture readonly %b) {
 ; ARM-LABEL: cmp_and8_short_int:
 ; ARM: @ %bb.0: @ %entry
-; ARM-NEXT: ldrb r0, [r0]
 ; ARM-NEXT: ldrb r1, [r1]
-; ARM-NEXT: and r0, r1, r0
+; ARM-NEXT: ldrb r0, [r0]
+; ARM-NEXT: and r0, r0, r1
 ; ARM-NEXT: clz r0, r0
 ; ARM-NEXT: lsr r0, r0, #5
 ; ARM-NEXT: bx lr
 ;
 ; ARMEB-LABEL: cmp_and8_short_int:
 ; ARMEB: @ %bb.0: @ %entry
-; ARMEB-NEXT: ldrb r0, [r0, #1]
 ; ARMEB-NEXT: ldrb r1, [r1, #3]
-; ARMEB-NEXT: and r0, r1, r0
+; ARMEB-NEXT: ldrb r0, [r0, #1]
+; ARMEB-NEXT: and r0, r0, r1
 ; ARMEB-NEXT: clz r0, r0
 ; ARMEB-NEXT: lsr r0, r0, #5
 ; ARMEB-NEXT: bx lr
 ;
 ; THUMB1-LABEL: cmp_and8_short_int:
 ; THUMB1: @ %bb.0: @ %entry
-; THUMB1-NEXT: ldrb r0, [r0]
 ; THUMB1-NEXT: ldrb r1, [r1]
-; THUMB1-NEXT: ands r1, r0
-; THUMB1-NEXT: rsbs r0, r1, #0
-; THUMB1-NEXT: adcs r0, r1
+; THUMB1-NEXT: ldrb r2, [r0]
+; THUMB1-NEXT: ands r2, r1
+; THUMB1-NEXT: rsbs r0, r2, #0
+; THUMB1-NEXT: adcs r0, r2
 ; THUMB1-NEXT: bx lr
 ;
 ; THUMB2-LABEL: cmp_and8_short_int:
 ; THUMB2: @ %bb.0: @ %entry
-; THUMB2-NEXT: ldrb r0, [r0]
 ; THUMB2-NEXT: ldrb r1, [r1]
+; THUMB2-NEXT: ldrb r0, [r0]
 ; THUMB2-NEXT: ands r0, r1
 ; THUMB2-NEXT: clz r0, r0
 ; THUMB2-NEXT: lsrs r0, r0, #5
@@ -846,7 +846,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; ARM-LABEL: test6:
 ; ARM: @ %bb.0: @ %entry
 ; ARM-NEXT: ldrb r0, [r0]
-; ARM-NEXT: and r0, r0, r1
+; ARM-NEXT: and r0, r1, r0
 ; ARM-NEXT: uxtb r1, r2
 ; ARM-NEXT: sub r0, r0, r1
 ; ARM-NEXT: clz r0, r0
@@ -856,7 +856,7 @@ define arm_aapcscc i1 @test6(i8* %x, i8 %y, i8 %z) {
 ; ARMEB-LABEL: test6:
 ; ARMEB: @ %bb.0: @ %entry
 ; ARMEB-NEXT: ldrb r0, [r0]
-; ARMEB-NEXT: and r0, r0, r1
+; ARMEB-NEXT: and r0, r1, r0
 ; ARMEB-NEXT: uxtb r1, r2
 ; ARMEB-NEXT: sub r0, r0, r1
 ; ARMEB-NEXT: clz r0, r0
@@ -893,7 +893,7 @@ define arm_aapcscc i1 @test7(i16* %x, i16 %y, i8 %z) {
 ; ARM-LABEL: test7:
 ; ARM: @ %bb.0: @ %entry
 ; ARM-NEXT: ldrb r0, [r0]
-; ARM-NEXT: and r0, r0, r1
+; ARM-NEXT: and r0, r1, r0
 ; ARM-NEXT: uxtb r1, r2
 ; ARM-NEXT: sub r0, r0, r1
 ; ARM-NEXT: clz r0, r0
@@ -903,7 +903,7 @@ define arm_aapcscc i1 @test7(i16* %x, i16 %y, i8 %z) {
 ; ARMEB-LABEL: test7:
 ; ARMEB: @ %bb.0: @ %entry
 ; ARMEB-NEXT: ldrb r0, [r0, #1]
-; ARMEB-NEXT: and r0, r0, r1
+; ARMEB-NEXT: and r0, r1, r0
 ; ARMEB-NEXT: uxtb r1, r2
 ; ARMEB-NEXT: sub r0, r0, r1
 ; ARMEB-NEXT: clz r0, r0
@@ -1550,34 +1550,34 @@ define arm_aapcscc i64 @test26(i64* nocapture %p) {
   ret i64 %and
 }
 
-define void @test27(i32* nocapture %ptr) {
 ; ARM-LABEL: test27:
-; ARM: @ %bb.0:
+; ARM: @ %bb.0: @ %entry
 ; ARM-NEXT: ldrb r1, [r0, #1]
 ; ARM-NEXT: lsl r1, r1, #16
 ; ARM-NEXT: str r1, [r0]
 ; ARM-NEXT: bx lr
 ;
 ; ARMEB-LABEL: test27:
-; ARMEB: @ %bb.0:
-; ARMEB-NEXT: ldrb r1, [r0, #2]
-; ARMEB-NEXT: lsl r1, r1, #16
-; ARMEB-NEXT: str r1, [r0]
-; ARMEB-NEXT: bx lr
+; ARMEB: @ %bb.0: @ %entry
+; ARMEB-NEXT: ldrb r1, [r0, #2]
+; ARMEB-NEXT: lsl r1, r1, #16
+; ARMEB-NEXT: str r1, [r0]
+; ARMEB-NEXT: bx lr
 ;
 ; THUMB1-LABEL: test27:
-; THUMB1: @ %bb.0:
-; THUMB1-NEXT: ldrb r1, [r0, #1]
-; THUMB1-NEXT: lsls r1, r1, #16
-; THUMB1-NEXT: str r1, [r0]
-; THUMB1-NEXT: bx lr
+; THUMB1: @ %bb.0: @ %entry
+; THUMB1-NEXT: ldrb r1, [r0, #1]
+; THUMB1-NEXT: lsls r1, r1, #16
+; THUMB1-NEXT: str r1, [r0]
+; THUMB1-NEXT: bx lr
 ;
 ; THUMB2-LABEL: test27:
-; THUMB2: @ %bb.0:
+; THUMB2: @ %bb.0: @ %entry
 ; THUMB2-NEXT: ldrb r1, [r0, #1]
 ; THUMB2-NEXT: lsls r1, r1, #16
 ; THUMB2-NEXT: str r1, [r0]
 ; THUMB2-NEXT: bx lr
+define void @test27(i32* nocapture %ptr) {
 entry:
   %0 = load i32, i32* %ptr, align 4
   %and = and i32 %0, 65280
@@ -528,7 +528,7 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
 ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
 define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK: add r0, r0, r1
+; CHECK: add r0, r1, r0
 ; CHECK-NEXT: mov r1, #65280
 ; CHECK-NEXT: mov r2, #16711680
 ; CHECK-NEXT: ldr r0, [r0, #13]
@@ -540,7 +540,7 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK-NEXT: mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6: add r0, r1, r0
 ; CHECK-ARMv6-NEXT: ldr r0, [r0, #13]
 ; CHECK-ARMv6-NEXT: rev r0, r0
 ; CHECK-ARMv6-NEXT: bx lr
@@ -479,12 +479,12 @@ define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
 ; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
 define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK: add r0, r0, r1
+; CHECK: add r0, r1, r0
 ; CHECK-NEXT: ldr r0, [r0, #13]
 ; CHECK-NEXT: mov pc, lr
 ;
 ; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
-; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6: add r0, r1, r0
 ; CHECK-ARMv6-NEXT: ldr r0, [r0, #13]
 ; CHECK-ARMv6-NEXT: bx lr
 %tmp = add nuw nsw i32 %i, 4
@@ -13,7 +13,7 @@ define void @f1(<2 x i64> %a0) {
 ; CHECK-NEXT: vn %v0, %v0, %v0
 ; CHECK-NEXT: vno %v2, %v2, %v2
 ; CHECK-NEXT: vceqg %v0, %v0, %v1
-; CHECK-NEXT: vx %v0, %v2, %v0
+; CHECK-NEXT: vx %v0, %v0, %v2
 ; CHECK-NEXT: vnc %v0, %v2, %v0
 ; CHECK-NEXT: vlgvf %r0, %v0, 1
 ; CHECK-NEXT: tmll %r0, 1
@@ -17,16 +17,16 @@ define i32 @test_values(i32 %a, i32 %b) minsize optsize {
 ; CHECK-V6M-NEXT: adds r0, r1, r0
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .LBB0_5:
-; CHECK-V6M-NEXT: adds r0, r1, r0
+; CHECK-V6M-NEXT: adds r0, r0, r1
 ; CHECK-V6M-NEXT: adds r0, r0, #4
 ; CHECK-V6M-NEXT: .LBB0_6:
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .LBB0_7:
-; CHECK-V6M-NEXT: adds r0, r1, r0
+; CHECK-V6M-NEXT: adds r0, r0, r1
 ; CHECK-V6M-NEXT: adds r0, r0, #1
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .LBB0_8:
-; CHECK-V6M-NEXT: adds r0, r1, r0
+; CHECK-V6M-NEXT: adds r0, r0, r1
 ; CHECK-V6M-NEXT: adds r0, r0, #2
 ; CHECK-V6M-NEXT: bx lr
 ; CHECK-V6M-NEXT: .p2align 2
@@ -165,10 +165,10 @@ define i32 @load_test9() {
 ; NON-PIC-NEXT: i32.load $push4=, 0($pop3){{$}}
 ; NON-PIC-NEXT: return $pop4{{$}}
 
-; PIC-NEXT: global.get $push2=, g@GOT{{$}}
 ; PIC-NEXT: i32.const $push0=, 2{{$}}
 ; PIC-NEXT: i32.shl $push1=, $0, $pop0{{$}}
-; PIC-NEXT: i32.add $push3=, $pop2, $pop1{{$}}
+; PIC-NEXT: global.get $push2=, g@GOT{{$}}
+; PIC-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
 ; PIC-NEXT: i32.const $push4=, -40{{$}}
 ; PIC-NEXT: i32.add $push5=, $pop3, $pop4{{$}}
 ; PIC-NEXT: i32.load $push6=, 0($pop5){{$}}
@@ -206,7 +206,7 @@ define i32 @load_test11_noinbounds(i32* %p) {
 ; CHECK-NEXT: .functype load_test12 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -222,7 +222,7 @@ define i32 @load_test12(i32* %p, i32 %n) {
 ; CHECK-NEXT: .functype load_test13 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -284,7 +284,7 @@ define i32 @load_test16(i32* %p, i32 %n) {
 ; CHECK-NEXT: .functype load_test17 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -314,7 +314,7 @@ define i32 @load_test18(i32* %p, i32 %n) {
 ; CHECK-NEXT: .functype load_test19 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, 40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -342,7 +342,7 @@ define i32 @load_test20(i32* %p) {
 ; CHECK-NEXT: .functype load_test21 (i32, i32) -> (i32){{$}}
 ; CHECK-NEXT: i32.const $push0=, 2{{$}}
 ; CHECK-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; CHECK-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; CHECK-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; CHECK-NEXT: i32.const $push3=, -40{{$}}
 ; CHECK-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; CHECK-NEXT: i32.load $push5=, 0($pop4){{$}}
@@ -501,10 +501,10 @@ define void @store_test9(i32 %i) {
 ; NON-PIC-NEXT: i32.const $push2=, g-40{{$}}
 ; NON-PIC-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop3), $1{{$}}
-; PIC-NEXT: global.get $push2=, g@GOT{{$}}
 ; PIC-NEXT: i32.const $push0=, 2{{$}}
 ; PIC-NEXT: i32.shl $push1=, $0, $pop0{{$}}
-; PIC-NEXT: i32.add $push3=, $pop2, $pop1{{$}}
+; PIC-NEXT: global.get $push2=, g@GOT{{$}}
+; PIC-NEXT: i32.add $push3=, $pop1, $pop2{{$}}
 ; PIC-NEXT: i32.const $push4=, -40{{$}}
 ; PIC-NEXT: i32.add $push5=, $pop3, $pop4{{$}}
 ; PIC-NEXT: i32.store 0($pop5), $1{{$}}
@@ -542,7 +542,7 @@ define void @store_test11_noinbounds(i32* %p, i32 %i) {
 ; CHECK-NEXT: .functype store_test12 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -558,7 +558,7 @@ define void @store_test12(i32* %p, i32 %n, i32 %i) {
 ; CHECK-NEXT: .functype store_test13 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -620,7 +620,7 @@ define void @store_test16(i32* %p, i32 %n, i32 %i) {
 ; CHECK-NEXT: .functype store_test17 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -650,7 +650,7 @@ define void @store_test18(i32* %p, i32 %n, i32 %i) {
 ; CHECK-NEXT: .functype store_test19 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, 40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
@@ -678,7 +678,7 @@ define void @store_test20(i32* %p, i32 %i) {
 ; CHECK-NEXT: .functype store_test21 (i32, i32, i32) -> (){{$}}
 ; NON-PIC-NEXT: i32.const $push0=, 2{{$}}
 ; NON-PIC-NEXT: i32.shl $push1=, $1, $pop0{{$}}
-; NON-PIC-NEXT: i32.add $push2=, $0, $pop1{{$}}
+; NON-PIC-NEXT: i32.add $push2=, $pop1, $0{{$}}
 ; NON-PIC-NEXT: i32.const $push3=, -40{{$}}
 ; NON-PIC-NEXT: i32.add $push4=, $pop2, $pop3{{$}}
 ; NON-PIC-NEXT: i32.store 0($pop4), $2{{$}}
|
@ -26,7 +26,7 @@ define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
|
||||
; CHECK-LABEL: add_nsw_sext_add:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: movslq %edi, %rax
|
||||
; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
|
||||
; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
|
||||
; CHECK-NEXT: retq
|
||||
|
||||
%add = add nsw i32 %i, 5
|
||||
@ -73,7 +73,7 @@ define i8* @gep8(i32 %i, i8* %x) {
|
||||
; CHECK-LABEL: gep8:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: movslq %edi, %rax
|
||||
; CHECK-NEXT: leaq 5(%rsi,%rax), %rax
|
||||
; CHECK-NEXT: leaq 5(%rax,%rsi), %rax
|
||||
; CHECK-NEXT: retq
|
||||
|
||||
%add = add nsw i32 %i, 5
|
||||
@ -128,7 +128,7 @@ define i128* @gep128(i32 %i, i128* %x) {
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: movslq %edi, %rax
|
||||
; CHECK-NEXT: shlq $4, %rax
|
||||
; CHECK-NEXT: leaq 80(%rsi,%rax), %rax
|
||||
; CHECK-NEXT: leaq 80(%rax,%rsi), %rax
|
||||
; CHECK-NEXT: retq
|
||||
|
||||
%add = add nsw i32 %i, 5
|
||||
@ -169,12 +169,13 @@ define void @PR20134(i32* %a, i32 %i) {
|
||||
|
||||
; The same as @PR20134 but sign extension is replaced with zero extension
|
||||
define void @PR20134_zext(i32* %a, i32 %i) {
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: movl %esi, %eax
|
||||
; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx
|
||||
; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx
|
||||
; CHECK-NEXT: movl %ecx, (%rdi,%rax,4)
|
||||
; CHECK-NEXT: retq
|
||||
; CHECK-LABEL: PR20134_zext:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: movl %esi, %eax
|
||||
; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx
|
||||
; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx
|
||||
; CHECK-NEXT: movl %ecx, (%rdi,%rax,4)
|
||||
; CHECK-NEXT: retq
|
||||
|
||||
%add1 = add nuw i32 %i, 1
|
||||
%idx1 = zext i32 %add1 to i64
|
||||
|
@@ -38,10 +38,10 @@ define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190
-; CHECK-NEXT: leal (%eax,%edx), %esi
+; CHECK-NEXT: leal (%edx,%eax), %esi
 ; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4)
-; CHECK-NEXT: movl $22, 2080(%eax,%edx)
-; CHECK-NEXT: movl $33, 10080(%eax,%edx)
+; CHECK-NEXT: movl $22, 2080(%edx,%eax)
+; CHECK-NEXT: movl $33, 10080(%edx,%eax)
 ; CHECK-NEXT: popl %esi
 ; CHECK-NEXT: retl
 entry:
@@ -966,7 +966,7 @@ define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
 ; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2:
 ; CHECK64: # %bb.0:
 ; CHECK64-NEXT: movl %esi, %eax
-; CHECK64-NEXT: movl 13(%rdi,%rax), %eax
+; CHECK64-NEXT: movl 13(%rax,%rdi), %eax
 ; CHECK64-NEXT: retq
 %tmp = add nuw nsw i32 %i, 4
 %tmp2 = add nuw nsw i32 %i, 3
@@ -1016,7 +1016,7 @@ define i32 @load_i32_by_i8_zaext_loads(i8* %arg, i32 %arg1) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl 12(%eax,%ecx), %eax
+; CHECK-NEXT: movl 12(%ecx,%eax), %eax
 ; CHECK-NEXT: retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_zaext_loads:
@@ -1072,7 +1072,7 @@ define i32 @load_i32_by_i8_zsext_loads(i8* %arg, i32 %arg1) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT: movl 12(%eax,%ecx), %eax
+; CHECK-NEXT: movl 12(%ecx,%eax), %eax
 ; CHECK-NEXT: retl
 ;
 ; CHECK64-LABEL: load_i32_by_i8_zsext_loads:
@@ -66,7 +66,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT: movzbl 2(%r8,%rbx,4), %ebx
 ; GENERIC-NEXT: shll $16, %ebx
 ; GENERIC-NEXT: orl %eax, %ebx
-; GENERIC-NEXT: xorl 16(%rdx,%rcx), %ebx
+; GENERIC-NEXT: xorl 16(%rcx,%rdx), %ebx
 ; GENERIC-NEXT: shrl $8, %edi
 ; GENERIC-NEXT: movzbl 3(%r9,%rdi,4), %eax
 ; GENERIC-NEXT: shll $24, %eax
@@ -74,7 +74,7 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; GENERIC-NEXT: movzbl 2(%r8,%rdi,4), %edi
 ; GENERIC-NEXT: shll $16, %edi
 ; GENERIC-NEXT: orl %eax, %edi
-; GENERIC-NEXT: xorl 20(%rdx,%rcx), %edi
+; GENERIC-NEXT: xorl 20(%rcx,%rdx), %edi
 ; GENERIC-NEXT: movl %ebx, %eax
 ; GENERIC-NEXT: shrl $24, %eax
 ; GENERIC-NEXT: movb %al, (%rsi)
@@ -156,8 +156,8 @@ define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r
 ; ATOM-NEXT: shll $16, %eax
 ; ATOM-NEXT: orl %edi, %ebp
 ; ATOM-NEXT: orl %r15d, %eax
-; ATOM-NEXT: xorl 20(%rdx,%rcx), %ebp
-; ATOM-NEXT: xorl 16(%rdx,%rcx), %eax
+; ATOM-NEXT: xorl 20(%rcx,%rdx), %ebp
+; ATOM-NEXT: xorl 16(%rcx,%rdx), %eax
 ; ATOM-NEXT: movl %eax, %edi
 ; ATOM-NEXT: shrl $16, %eax
 ; ATOM-NEXT: shrl $24, %edi
@@ -44,7 +44,7 @@ entry:
 define void @indexed_store_merge(i64 %p, i8* %v) {
 ; CHECK-LABEL: indexed_store_merge:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl $0, 2(%rsi,%rdi)
+; CHECK-NEXT: movl $0, 2(%rdi,%rsi)
 ; CHECK-NEXT: movb $0, (%rsi)
 ; CHECK-NEXT: retq
 entry:
@@ -1403,18 +1403,18 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; SSE2-NEXT: movdqu (%rdi), %xmm0
 ; SSE2-NEXT: movdqu (%rsi), %xmm1
 ; SSE2-NEXT: psadbw %xmm0, %xmm1
-; SSE2-NEXT: movdqu (%rdx), %xmm0
-; SSE2-NEXT: movdqu (%rcx), %xmm2
-; SSE2-NEXT: psadbw %xmm0, %xmm2
 ; SSE2-NEXT: movl $1, %eax
 ; SSE2-NEXT: movd %eax, %xmm0
 ; SSE2-NEXT: paddd %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: movd %xmm1, %eax
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movdqu (%rdx), %xmm1
+; SSE2-NEXT: movdqu (%rcx), %xmm2
+; SSE2-NEXT: psadbw %xmm1, %xmm2
+; SSE2-NEXT: paddd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT: paddd %xmm1, %xmm0
 ; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: retq
 ;
 ; AVX1-LABEL: sad_unroll_nonzero_initial:
@@ -1425,8 +1425,8 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX1-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
 ; AVX1-NEXT: movl $1, %eax
 ; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
@@ -1438,12 +1438,12 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX2: # %bb.0: # %bb
 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0
 ; AVX2-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX2-NEXT: movl $1, %eax
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX2-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: movl $1, %eax
+; AVX2-NEXT: vmovd %eax, %xmm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -1458,12 +1458,12 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
 ; AVX512: # %bb.0: # %bb
 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0
 ; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: movl $1, %eax
-; AVX512-NEXT: vmovd %eax, %xmm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vmovdqu (%rdx), %xmm1
 ; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
-; AVX512-NEXT: vpaddd %zmm0, %zmm1, %zmm0
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: vmovd %eax, %xmm2
+; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -146,7 +146,7 @@ define <8 x i16> @zext_and_v8i16(<8 x i8> %x, <8 x i8> %y) {
 ;
 ; AVX2-LABEL: zext_and_v8i16:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vandps %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vandps %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT: retq
 %xz = zext <8 x i8> %x to <8 x i16>