- Improved v8i16 shuffle lowering. It now uses pshuflw and pshufhw as much as
  possible before resorting to pextrw and pinsrw.
- Better codegen for v4i32 shuffles masquerading as v8i16 or v16i8 shuffles.
- Improves (i16 extract_vector_element 0) codegen by recognizing that
  (i32 extract_vector_element 0) does not require a pextrw.

llvm-svn: 44836
2024-10-19 02:52:53 +02:00 · 2007-12-11 01:46:18 +00:00 · 2007-12-11 01:46:18 +00:00 · f6c2838f36
commit f6c2838f36
parent 8b194d1718
3 changed files with 312 additions and 97 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -23,6 +23,7 @@
 #include "llvm/GlobalVariable.h"
 #include "llvm/Function.h"
 #include "llvm/Intrinsics.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/VectorExtras.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@ -35,6 +36,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ParameterAttributes.h"
 using namespace llvm;
@ -2714,7 +2716,7 @@ static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
-    if (Val > 4)
+    if (Val >= 4)
      return false;
  }

@ -3130,6 +3132,8 @@ static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
  return V;
 }

+/// is4WideVector - Returns true if the specific v8i16 or v16i8 vector is
+/// actually just a 4 wide vector. e.g. <a, a, y, y, d, d, x, x>
 SDOperand
 X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  // All zero's are handled with pxor, all one's are handled with pcmpeqd.
@ -3154,7 +3158,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  unsigned NumNonZeroImms = 0;
-  std::set<SDOperand> Values;
+  SmallSet<SDOperand, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDOperand Elt = Op.getOperand(i);
    if (Elt.getOpcode() != ISD::UNDEF) {
@ -3314,59 +3318,179 @@ static
 SDOperand LowerVECTOR_SHUFFLEv8i16(SDOperand V1, SDOperand V2,
                                   SDOperand PermMask, SelectionDAG &DAG,
                                   TargetLowering &TLI) {
+  SDOperand NewV;
  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(8);
  MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
-  if (isPSHUFHW_PSHUFLWMask(PermMask.Val)) {
-    // Handle v8i16 shuffle high / low shuffle node pair.
-    SmallVector<SDOperand, 8> MaskVec;
-    for (unsigned i = 0; i != 4; ++i)
-      MaskVec.push_back(PermMask.getOperand(i));
-    for (unsigned i = 4; i != 8; ++i)
-      MaskVec.push_back(DAG.getConstant(i, MaskEVT));
-    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
-    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V2, Mask);
-    MaskVec.clear();
-    for (unsigned i = 0; i != 4; ++i)
-      MaskVec.push_back(DAG.getConstant(i, MaskEVT));
-    for (unsigned i = 4; i != 8; ++i)
-      MaskVec.push_back(PermMask.getOperand(i));
-    Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
-    return DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V2, Mask);
+  MVT::ValueType PtrVT = TLI.getPointerTy();
+  SmallVector<SDOperand, 8> MaskElts(PermMask.Val->op_begin(),
+                                     PermMask.Val->op_end());
+
+  // First record which half of which vector the low elements come from.
+  SmallVector<unsigned, 4> LowQuad(4);
+  for (unsigned i = 0; i < 4; ++i) {
+    SDOperand Elt = MaskElts[i];
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+    int QuadIdx = EltIdx / 4;
+    ++LowQuad[QuadIdx];
+  }
+  int BestLowQuad = -1;
+  unsigned MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (LowQuad[i] > MaxQuad) {
+      BestLowQuad = i;
+      MaxQuad = LowQuad[i];
+    }
  }

-  // Lower than into extracts and inserts but try to do as few as possible.
+  // Record which half of which vector the high elements come from.
+  SmallVector<unsigned, 4> HighQuad(4);
+  for (unsigned i = 4; i < 8; ++i) {
+    SDOperand Elt = MaskElts[i];
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+    int QuadIdx = EltIdx / 4;
+    ++HighQuad[QuadIdx];
+  }
+  int BestHighQuad = -1;
+  MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (HighQuad[i] > MaxQuad) {
+      BestHighQuad = i;
+      MaxQuad = HighQuad[i];
+    }
+  }
+
+  // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it.
+  if (BestLowQuad != -1 || BestHighQuad != -1) {
+    // First sort the 4 chunks in order using shufpd.
+    SmallVector<SDOperand, 8> MaskVec;
+    if (BestLowQuad != -1)
+      MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32));
+    else
+      MaskVec.push_back(DAG.getConstant(0, MVT::i32));
+    if (BestHighQuad != -1)
+      MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32));
+    else
+      MaskVec.push_back(DAG.getConstant(1, MVT::i32));
+    SDOperand Mask= DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, &MaskVec[0],2);
+    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64,
+                       DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V1),
+                       DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V2), Mask);
+    NewV = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, NewV);
+
+    // Now sort high and low parts separately.
+    BitVector InOrder(8);
+    if (BestLowQuad != -1) {
+      // Sort lower half in order using PSHUFLW.
+      MaskVec.clear();
+      bool AnyOutOrder = false;
+      for (unsigned i = 0; i != 4; ++i) {
+        SDOperand Elt = MaskElts[i];
+        if (Elt.getOpcode() == ISD::UNDEF) {
+          MaskVec.push_back(Elt);
+          InOrder.set(i);
+        } else {
+          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+          if (EltIdx != i)
+            AnyOutOrder = true;
+          MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT));
+          // If this element is in the right place after this shuffle, then
+          // remember it.
+          if ((int)(EltIdx / 4) == BestLowQuad)
+            InOrder.set(i);
+        }
+      }
+      if (AnyOutOrder) {
+        for (unsigned i = 4; i != 8; ++i)
+          MaskVec.push_back(DAG.getConstant(i, MaskEVT));
+        SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
+        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
+      }
+    }
+
+    if (BestHighQuad != -1) {
+      // Sort high half in order using PSHUFHW if possible.
+      MaskVec.clear();
+      for (unsigned i = 0; i != 4; ++i)
+        MaskVec.push_back(DAG.getConstant(i, MaskEVT));
+      bool AnyOutOrder = false;
+      for (unsigned i = 4; i != 8; ++i) {
+        SDOperand Elt = MaskElts[i];
+        if (Elt.getOpcode() == ISD::UNDEF) {
+          MaskVec.push_back(Elt);
+          InOrder.set(i);
+        } else {
+          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+          if (EltIdx != i)
+            AnyOutOrder = true;
+          MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT));
+          // If this element is in the right place after this shuffle, then
+          // remember it.
+          if ((int)(EltIdx / 4) == BestHighQuad)
+            InOrder.set(i);
+        }
+      }
+      if (AnyOutOrder) {
+        SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
+        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
+      }
+    }
+
+    // The other elements are put in the right place using pextrw and pinsrw.
+    for (unsigned i = 0; i != 8; ++i) {
+      if (InOrder[i])
+        continue;
+      SDOperand Elt = MaskElts[i];
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      if (EltIdx == i)
+        continue;
+      SDOperand ExtOp = (EltIdx < 8)
+        ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
+                      DAG.getConstant(EltIdx, PtrVT))
+        : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
+                      DAG.getConstant(EltIdx - 8, PtrVT));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
+                         DAG.getConstant(i, PtrVT));
+    }
+    return NewV;
+  }
+
+  // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use
+  // as few as possible.
  // First, let's find out how many elements are already in the right order.
  unsigned V1InOrder = 0;
  unsigned V1FromV1 = 0;
  unsigned V2InOrder = 0;
  unsigned V2FromV2 = 0;
-  SmallVector<unsigned, 8> V1Elts;
-  SmallVector<unsigned, 8> V2Elts;
+  SmallVector<SDOperand, 8> V1Elts;
+  SmallVector<SDOperand, 8> V2Elts;
  for (unsigned i = 0; i < 8; ++i) {
-    SDOperand Elt = PermMask.getOperand(i);
+    SDOperand Elt = MaskElts[i];
    if (Elt.getOpcode() == ISD::UNDEF) {
-      V1Elts.push_back(i);
-      V2Elts.push_back(i);
+      V1Elts.push_back(Elt);
+      V2Elts.push_back(Elt);
      ++V1InOrder;
      ++V2InOrder;
+      continue;
+    }
+    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+    if (EltIdx == i) {
+      V1Elts.push_back(Elt);
+      V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
+      ++V1InOrder;
+    } else if (EltIdx == i+8) {
+      V1Elts.push_back(Elt);
+      V2Elts.push_back(DAG.getConstant(i, MaskEVT));
+      ++V2InOrder;
+    } else if (EltIdx < 8) {
+      V1Elts.push_back(Elt);
+      ++V1FromV1;
    } else {
-      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
-      if (EltIdx == i) {
-        V1Elts.push_back(i);
-        V2Elts.push_back(i+8);
-        ++V1InOrder;
-      } else if (EltIdx == i+8) {
-        V1Elts.push_back(i+8);
-        V2Elts.push_back(i);
-        ++V2InOrder;
-      } else {
-        V1Elts.push_back(EltIdx);
-        V2Elts.push_back(EltIdx);
-        if (EltIdx < 8)
-          ++V1FromV1;
-        else
-          ++V2FromV2;
-      }
+      V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT));
+      ++V2FromV2;
    }
  }

@ -3377,33 +3501,92 @@ SDOperand LowerVECTOR_SHUFFLEv8i16(SDOperand V1, SDOperand V2,
    std::swap(V1FromV1, V2FromV2);
  }

-  MVT::ValueType PtrVT = TLI.getPointerTy();
-  if (V1FromV1) {
-    // If there are elements that are from V1 but out of place,
-    // then first sort them in place
-    SmallVector<SDOperand, 8> MaskVec;
-    for (unsigned i = 0; i < 8; ++i) {
-      unsigned EltIdx = V1Elts[i];
-      if (EltIdx >= 8)
-        MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
-      else
-        MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
+  if ((V1FromV1 + V1InOrder) != 8) {
+    // Some elements are from V2.
+    if (V1FromV1) {
+      // If there are elements that are from V1 but out of place,
+      // then first sort them in place
+      SmallVector<SDOperand, 8> MaskVec;
+      for (unsigned i = 0; i < 8; ++i) {
+        SDOperand Elt = V1Elts[i];
+        if (Elt.getOpcode() == ISD::UNDEF) {
+          MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
+          continue;
+        }
+        unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+        if (EltIdx >= 8)
+          MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
+        else
+          MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
+      }
+      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
+      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V1, Mask);
    }
-    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
-    V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V1, Mask);
+
+    NewV = V1;
+    for (unsigned i = 0; i < 8; ++i) {
+      SDOperand Elt = V1Elts[i];
+      if (Elt.getOpcode() == ISD::UNDEF)
+        continue;
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      if (EltIdx < 8)
+        continue;
+      SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
+                                    DAG.getConstant(EltIdx - 8, PtrVT));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
+                         DAG.getConstant(i, PtrVT));
+    }
+    return NewV;
+  } else {
+    // All elements are from V1.
+    NewV = V1;
+    for (unsigned i = 0; i < 8; ++i) {
+      SDOperand Elt = V1Elts[i];
+      if (Elt.getOpcode() == ISD::UNDEF)
+        continue;
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
+                                    DAG.getConstant(EltIdx, PtrVT));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
+                         DAG.getConstant(i, PtrVT));
+    }
+    return NewV;
+  }
+}
+
+/// RewriteAs4WideShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones if possible. This can be done when every pair / quad of shuffle mask
+/// elements point to elements in the right sequence. e.g.
+/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
+static
+SDOperand RewriteAs4WideShuffle(SDOperand V1, SDOperand V2,
+                                SDOperand PermMask, SelectionDAG &DAG,
+                                TargetLowering &TLI) {
+  unsigned NumElems = PermMask.getNumOperands();
+  unsigned Scale = NumElems / 4;
+  SmallVector<SDOperand, 4> MaskVec;
+  for (unsigned i = 0; i < NumElems; i += Scale) {
+    unsigned StartIdx = ~0U;
+    for (unsigned j = 0; j < Scale; ++j) {
+      SDOperand Elt = PermMask.getOperand(i+j);
+      if (Elt.getOpcode() == ISD::UNDEF)
+        continue;
+      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
+      if (StartIdx == ~0U)
+        StartIdx = EltIdx - (EltIdx % Scale);
+      if (EltIdx != StartIdx + j)
+        return SDOperand();
+    }
+    if (StartIdx == ~0U)
+      MaskVec.push_back(DAG.getNode(ISD::UNDEF, MVT::i32));
+    else
+      MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MVT::i32));
  }

-  // Now let's insert elements from the other vector.
-  for (unsigned i = 0; i < 8; ++i) {
-    unsigned EltIdx = V1Elts[i];
-    if (EltIdx < 8)
-      continue;
-    SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
-                                  DAG.getConstant(EltIdx - 8, PtrVT));
-    V1 = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V1, ExtOp,
-                     DAG.getConstant(i, PtrVT));
-  }
-  return V1;
+  V1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V1);
+  V2 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, V2);
+  return DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v4i32, V1, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, MVT::v4i32, &MaskVec[0],4));
 }

 SDOperand
@ -3544,18 +3727,31 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
    }
  }

-  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
-  if (VT == MVT::v8i16)
-    return LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this);
+  // If the shuffle can be rewritten as a 4 wide shuffle, then do it!
+  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+    SDOperand NewOp = RewriteAs4WideShuffle(V1, V2, PermMask, DAG, *this);
+    if (NewOp.Val)
+      return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
+  }

-  if (NumElems == 4 &&  MVT::getSizeInBits(VT) != 64) {
+  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+  if (VT == MVT::v8i16) {
+    SDOperand NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this);
+    if (NewOp.Val)
+      return NewOp;
+  }
+
+  // Handle all 4 wide cases with a number of shuffles.
+  if (NumElems == 4 && MVT::getSizeInBits(VT) != 64) {
    // Don't do this for MMX.
    MVT::ValueType MaskVT = PermMask.getValueType();
    MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
    SmallVector<std::pair<int, int>, 8> Locs;
    Locs.reserve(NumElems);
-    SmallVector<SDOperand, 8> Mask1(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
-    SmallVector<SDOperand, 8> Mask2(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand, 8> Mask1(NumElems,
+                                    DAG.getNode(ISD::UNDEF, MaskEVT));
+    SmallVector<SDOperand, 8> Mask2(NumElems,
+                                    DAG.getNode(ISD::UNDEF, MaskEVT));
    unsigned NumHi = 0;
    unsigned NumLo = 0;
    // If no more than two elements come from either vector. This can be
@ -3661,6 +3857,13 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  // TODO: handle v16i8.
  if (MVT::getSizeInBits(VT) == 16) {
+    SDOperand Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
+                                 DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Vec),
+                                     Op.getOperand(1)));
    // Transform it so it match pextrw which produces a 32-bit result.
    MVT::ValueType EVT = (MVT::ValueType)(VT+1);
    SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
@ -3669,7 +3872,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
  } else if (MVT::getSizeInBits(VT) == 32) {
-    SDOperand Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
    if (Idx == 0)
      return Op;
@ -3686,12 +3888,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                 &IdxVec[0], IdxVec.size());
+    SDOperand Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
                       DAG.getConstant(0, getPointerTy()));
  } else if (MVT::getSizeInBits(VT) == 64) {
-    SDOperand Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
    if (Idx == 0)
      return Op;
@ -3706,6 +3908,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                 &IdxVec[0], IdxVec.size());
+    SDOperand Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
--- a/test/CodeGen/X86/vec_shuffle-12.ll
+++ b/test/CodeGen/X86/vec_shuffle-12.ll
@ -1,37 +1,28 @@
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2
 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep punpck
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pextrw | count 7
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pinsrw | count 7
-; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuf | count 2
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pextrw | count 4
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pinsrw | count 6
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuflw | count 3
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufhw | count 2

-define void @t1(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
+define <8 x i16> @t1(<8 x i16>* %A, <8 x i16>* %B) {
 	%tmp1 = load <8 x i16>* %A
 	%tmp2 = load <8 x i16>* %B
 	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
-	store <8 x i16> %tmp3, <8 x i16>* %res
-	ret void
+	ret <8 x i16> %tmp3
 }

-define void @t2(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7 >
-	store <8 x i16> %tmp3, <8 x i16>* %res
-	ret void
+define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) {
+	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 9, i32 1, i32 2, i32 9, i32 4, i32 5, i32 6, i32 7 >
+	ret <8 x i16> %tmp
 }

-define void @t3(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
-	store <8 x i16> %tmp3, <8 x i16>* %res
-	ret void
+define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) {
+	%tmp = shufflevector <8 x i16> %A, <8 x i16> %A, <8 x i32> < i32 8, i32 3, i32 2, i32 13, i32 7, i32 6, i32 5, i32 4 >
+	ret <8 x i16> %tmp
 }

-define void @t4(<8 x i16>* %res, <8 x i16>* %A, <8 x i16>* %B) {
-	%tmp1 = load <8 x i16>* %A
-	%tmp2 = load <8 x i16>* %B
-	%tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
-	store <8 x i16> %tmp3, <8 x i16>* %res
-	ret void
+define <8 x i16> @t4(<8 x i16> %A, <8 x i16> %B) {
+	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 7, i32 2, i32 3, i32 1, i32 5, i32 6, i32 5 >
+	ret <8 x i16> %tmp
 }
--- a/test/CodeGen/X86/vec_shuffle-13.ll
+++ b/test/CodeGen/X86/vec_shuffle-13.ll
@ -0,0 +1,21 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movlhps | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movss | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufd | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshuflw | count 1
+; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pshufhw | count 1
+
+define <8 x i16> @t1(<8 x i16> %A, <8 x i16> %B) {
+	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 0, i32 1, i32 10, i32 11, i32 2, i32 3 >
+	ret <8 x i16> %tmp
+}
+
+define <8 x i16> @t2(<8 x i16> %A, <8 x i16> %B) {
+	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >
+	ret <8 x i16> %tmp
+}
+
+define <8 x i16> @t3(<8 x i16> %A, <8 x i16> %B) {
+	%tmp = shufflevector <8 x i16> %A, <8 x i16> %B, <8 x i32> < i32 0, i32 0, i32 3, i32 2, i32 4, i32 6, i32 4, i32 7 >
+	ret <8 x i16> %tmp
+}