Improved shuffle normalization to avoid using extract/build when we

can extract using different indexes for two vectors. Added a few tests for vector shuffles. llvm-svn: 59399
2024-11-24 03:33:20 +01:00 · 2008-11-16 05:06:27 +00:00 · 2008-11-16 05:06:27 +00:00 · b6661b480b
commit b6661b480b
parent 3b058783bc
4 changed files with 203 additions and 97 deletions
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuild.cpp
@ -2292,8 +2292,8 @@ void SelectionDAGLowering::visitExtractElement(User &I) {
 // Utility for visitShuffleVector - Returns true if the mask is mask starting
 // from SIndx and increasing to the element length (undefs are allowed).
 static bool SequentialMask(SDValue Mask, unsigned SIndx) {
-  unsigned NumElems = Mask.getNumOperands();
-  for (unsigned i = 0; i != NumElems; ++i) {
+  unsigned MaskNumElts = Mask.getNumOperands();
+  for (unsigned i = 0; i != MaskNumElts; ++i) {
    if (Mask.getOperand(i).getOpcode() != ISD::UNDEF) {
      unsigned Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
      if (Idx != i + SIndx)
@ -2304,161 +2304,187 @@ static bool SequentialMask(SDValue Mask, unsigned SIndx) {
 }

 void SelectionDAGLowering::visitShuffleVector(User &I) {
-  SDValue V1   = getValue(I.getOperand(0));
-  SDValue V2   = getValue(I.getOperand(1));
+  SDValue Srcs[2];
+  Srcs[0] = getValue(I.getOperand(0));
+  Srcs[1] = getValue(I.getOperand(1));
  SDValue Mask = getValue(I.getOperand(2));

  MVT VT = TLI.getValueType(I.getType());
-  MVT VT1 = V1.getValueType();
-  unsigned MaskNumElts = Mask.getNumOperands();
-  unsigned Src1NumElts = VT1.getVectorNumElements();
+  MVT SrcVT = Srcs[0].getValueType();
+  int MaskNumElts = Mask.getNumOperands();
+  int SrcNumElts = SrcVT.getVectorNumElements();

-  if (Src1NumElts == MaskNumElts) {
-    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask));
+  if (SrcNumElts == MaskNumElts) {
+    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Srcs[0], Srcs[1], Mask));
    return;
  }

  // Normalize the shuffle vector since mask and vector length don't match.
-  if (Src1NumElts < MaskNumElts && MaskNumElts % Src1NumElts == 0) {
-    // We can concat vectors to make the mask and input vector match.
-    if (Src1NumElts*2 == MaskNumElts && SequentialMask(Mask, 0)) {
-      // The shuffle is concatenating two vectors.
-      setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, VT, V1, V2));
+  MVT MaskEltVT = Mask.getValueType().getVectorElementType();
+
+  if (SrcNumElts < MaskNumElts && MaskNumElts % SrcNumElts == 0) {
+    // Mask is longer than the source vectors and is a multiple of the source
+    // vectors.  We can use concatenate vector to make the mask and vectors
+    // length match.
+    if (SrcNumElts*2 == MaskNumElts && SequentialMask(Mask, 0)) {
+      // The shuffle is concatenating two vectors together.
+      setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, VT, Srcs[0], Srcs[1]));
      return;
    }

-    // Pad both vectors with undefs to the same size as the mask.
-    unsigned NumConcat = MaskNumElts / Src1NumElts;
-    std::vector<SDValue> UnOps(Src1NumElts,
-                               DAG.getNode(ISD::UNDEF, 
-                                           VT1.getVectorElementType()));
-    SDValue UndefVal = DAG.getNode(ISD::BUILD_VECTOR, VT1,
-                                   &UnOps[0], UnOps.size());
+    // Pad both vectors with undefs to make them the same length as the mask.
+    unsigned NumConcat = MaskNumElts / SrcNumElts;
+    SDValue UndefVal = DAG.getNode(ISD::UNDEF, SrcVT);

    SmallVector<SDValue, 8> MOps1, MOps2;
-    MOps1.push_back(V1);
-    MOps2.push_back(V2);
+    MOps1.push_back(Srcs[0]);
+    MOps2.push_back(Srcs[1]);
    for (unsigned i = 1; i != NumConcat; ++i) {
      MOps1.push_back(UndefVal);
      MOps2.push_back(UndefVal);
    }
-    V1 = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps1[0], MOps1.size());
-    V2 = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps2[0], MOps2.size());
+    Srcs[0] = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps1[0], MOps1.size());
+    Srcs[1] = DAG.getNode(ISD::CONCAT_VECTORS, VT, &MOps2[0], MOps2.size());
    
    // Readjust mask for new input vector length.
    SmallVector<SDValue, 8> MappedOps;
-    for (unsigned i = 0; i != MaskNumElts; ++i) {
+    for (int i = 0; i != MaskNumElts; ++i) {
      if (Mask.getOperand(i).getOpcode() == ISD::UNDEF) {
        MappedOps.push_back(Mask.getOperand(i));
      } else {
-        unsigned Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
-        if (Idx < Src1NumElts) {
-          MappedOps.push_back(DAG.getConstant(Idx,
-                                           Mask.getOperand(i).getValueType()));
-        } else {
-          MappedOps.push_back(DAG.getConstant(Idx + MaskNumElts - Src1NumElts,
-                                           Mask.getOperand(i).getValueType()));
-        } 
+        int Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
+        if (Idx < SrcNumElts)
+          MappedOps.push_back(DAG.getConstant(Idx, MaskEltVT));
+        else
+          MappedOps.push_back(DAG.getConstant(Idx + MaskNumElts - SrcNumElts,
+                                              MaskEltVT));
      }
    }
    Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
                       &MappedOps[0], MappedOps.size());

-    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask));
+    setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Srcs[0], Srcs[1], Mask));
    return;
  }

-  if (Src1NumElts > MaskNumElts) {
+  if (SrcNumElts > MaskNumElts) {
    // Resulting vector is shorter than the incoming vector.
-    if (Src1NumElts == MaskNumElts && SequentialMask(Mask,0)) {
+    if (SrcNumElts == MaskNumElts && SequentialMask(Mask,0)) {
      // Shuffle extracts 1st vector.
-      setValue(&I, V1);
+      setValue(&I, Srcs[0]);
      return;
    }

-    if (Src1NumElts == MaskNumElts && SequentialMask(Mask,MaskNumElts)) {
+    if (SrcNumElts == MaskNumElts && SequentialMask(Mask,MaskNumElts)) {
      // Shuffle extracts 2nd vector.
-      setValue(&I, V2);
+      setValue(&I, Srcs[1]);
      return;
    }

-    // Analyze the access pattern of the vector to see if we can extract each
-    // subvector and then do the shuffle. The analysis is done by calculating
-    // the range of elements the mask access on both vectors. If it is useful,
-    // we could do better by considering separate what elements are accessed
-    // in each vector (i.e., have min/max for each vector).
-    int MinRange = Src1NumElts+1;
-    int MaxRange = -1;
-    for (unsigned i = 0; i != MaskNumElts; ++i) {
+    // Analyze the access pattern of the vector to see if we can extract
+    // two subvectors and do the shuffle. The analysis is done by calculating
+    // the range of elements the mask access on both vectors.
+    int MinRange[2] = { SrcNumElts+1, SrcNumElts+1};
+    int MaxRange[2] = {-1, -1};
+
+    for (int i = 0; i != MaskNumElts; ++i) {
      SDValue Arg = Mask.getOperand(i);
      if (Arg.getOpcode() != ISD::UNDEF) {
        assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-        int Idx = cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
-        if (Idx > (int) Src1NumElts)
-          Idx -= Src1NumElts;
-        if (Idx > MaxRange)
-          MaxRange = Idx;
-        if (Idx < MinRange)
-          MinRange = Idx;
+        int Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
+        int Input = 0;
+        if (Idx >= SrcNumElts) {
+          Input = 1;
+          Idx -= SrcNumElts;
+        }
+        if (Idx > MaxRange[Input])
+          MaxRange[Input] = Idx;
+        if (Idx < MinRange[Input])
+          MinRange[Input] = Idx;
      }
    }
-    // Adjust MinRange to start at an even boundary since this give us
-    // better quality splits later.
-    if ((unsigned) MinRange < Src1NumElts && MinRange%2 != 0)
-      MinRange = MinRange - 1;
-    if (MaxRange - MinRange < (int) MaskNumElts) {
-      // Extract subvector because the range is less than the new vector length
-      unsigned StartIdx = (MinRange/MaskNumElts)*MaskNumElts;
-      if (MaxRange - StartIdx < MaskNumElts) {
-        V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, VT, V1,
-                         DAG.getIntPtrConstant(MinRange));
-        V2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, VT, V2,
-                         DAG.getIntPtrConstant(MinRange));
-        // Readjust mask for new input vector length.
-        SmallVector<SDValue, 8> MappedOps;
-        for (unsigned i = 0; i != MaskNumElts; ++i) {
-          if (Mask.getOperand(i).getOpcode() == ISD::UNDEF) {
-            MappedOps.push_back(Mask.getOperand(i));
-          } else {
-            unsigned Idx =
-              cast<ConstantSDNode>(Mask.getOperand(i))->getZExtValue();
-            if (Idx < Src1NumElts) {
-              MappedOps.push_back(DAG.getConstant(Idx - StartIdx,
-                                         Mask.getOperand(i).getValueType()));
-            } else {
-              Idx = Idx - Src1NumElts - StartIdx + MaskNumElts;
-              MappedOps.push_back(DAG.getConstant(Idx,
-                                        Mask.getOperand(i).getValueType()));
-            } 
-          }
-        }
-        Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
-                           &MappedOps[0], MappedOps.size());

-        setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask));
-        return;
+    // Check if the access is smaller than the vector size and can we find
+    // a reasonable extract index.
+    int RangeUse[2];  // 0 = Unused, 1 = Extract, 2 = Can not Extract.
+    int StartIdx[2];  // StartIdx to extract from
+    for (int Input=0; Input < 2; ++Input) {
+      if (MinRange[Input] == SrcNumElts+1 && MaxRange[Input] == -1) {
+        RangeUse[Input] = 0; // Unused
+        StartIdx[Input] = 0;
+      } else if (MaxRange[Input] - MinRange[Input] < MaskNumElts) {
+        // Fits within range but we should see if we can find a good
+        // start index that a multiple of the mask length.
+        if (MaxRange[Input] < MaskNumElts) {
+          RangeUse[Input] = 1; // Extract from beginning of the vector
+          StartIdx[Input] = 0;
+        } else {
+          StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
+          if (MaxRange[Input] - StartIdx[Input] < MaskNumElts) 
+            RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+          else
+            RangeUse[Input] = 2; // Can not extract
+        }
+      } else
+        RangeUse[Input] = 2;  // Access doesn't fit within range
+    }
+
+    if (RangeUse[0] == 0 && RangeUse[0] == 0) {
+      setValue(&I, DAG.getNode(ISD::UNDEF, VT));  // Vectors are not used.
+      return;
+    }
+    else if (RangeUse[0] < 2 && RangeUse[1] < 2) {
+      // Extract appropriate subvector and generate a vector shuffle
+      for (int Input=0; Input < 2; ++Input) {
+        if (RangeUse[Input] == 0) {
+          Srcs[Input] = DAG.getNode(ISD::UNDEF, VT);
+        } else {
+          Srcs[Input] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, VT, Srcs[Input],
+                                    DAG.getIntPtrConstant(StartIdx[Input]));
+        }
      }
+      // Calculate new mask.
+      SmallVector<SDValue, 8> MappedOps;
+      for (int i = 0; i != MaskNumElts; ++i) {
+        SDValue Arg = Mask.getOperand(i);
+        if (Arg.getOpcode() == ISD::UNDEF) {
+          MappedOps.push_back(Arg);
+        } else {
+          int Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
+          if (Idx < SrcNumElts)
+            MappedOps.push_back(DAG.getConstant(Idx - StartIdx[0], MaskEltVT));
+          else {
+            Idx = Idx - SrcNumElts - StartIdx[1] + MaskNumElts;
+            MappedOps.push_back(DAG.getConstant(Idx, MaskEltVT));
+          } 
+        }
+      }
+      Mask = DAG.getNode(ISD::BUILD_VECTOR, Mask.getValueType(),
+                         &MappedOps[0], MappedOps.size());
+      setValue(&I, DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Srcs[0], Srcs[1], Mask));
+      return;
    }
  }

-  // We can't use either concat vectors or extract subvectors so we fall back
-  // to insert and extracts.
+  // We can't use either concat vectors or extract subvectors so fall back to
+  // replacing the shuffle with extract and build vector.
+  // to insert and build vector.
  MVT EltVT = VT.getVectorElementType();
  MVT PtrVT = TLI.getPointerTy();
  SmallVector<SDValue,8> Ops;
-  for (unsigned i = 0; i != MaskNumElts; ++i) {
+  for (int i = 0; i != MaskNumElts; ++i) {
    SDValue Arg = Mask.getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) {
      Ops.push_back(DAG.getNode(ISD::UNDEF, EltVT));
    } else {
      assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
-      unsigned Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
-      if (Idx < Src1NumElts)
-        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, V1,
+      int Idx = cast<ConstantSDNode>(Arg)->getZExtValue();
+      if (Idx < SrcNumElts)
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, Srcs[0],
                                  DAG.getConstant(Idx, PtrVT)));
      else
-        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, V2,
-                                  DAG.getConstant(Idx - Src1NumElts, PtrVT)));
+        Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, EltVT, Srcs[1],
+                                  DAG.getConstant(Idx - SrcNumElts, PtrVT)));
    }
  }
  setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, VT, &Ops[0], Ops.size()));
--- a/test/CodeGen/X86/vec_shuffle-25.ll
+++ b/test/CodeGen/X86/vec_shuffle-25.ll
@ -0,0 +1,34 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
+; RUN: grep unpcklps %t | count 3
+; RUN: grep unpckhps %t | count 1
+ 
+; Transpose example using the more generic vector shuffle.  We return
+; float8 instead of float16 since x86 can return that in register.
+; ModuleID = 'transpose2_opt.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-apple-cl.1.0"
+@r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+@r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+@r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+@r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+
+define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
+entry:
+	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpcklps14a = shufflevector <4 x float> %unpcklps14,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 
+	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+	%unpckhps17a = shufflevector <4 x float> %unpckhps17,  <4 x float> undef, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 	
+	%r1 = shufflevector <16 x float> %unpcklps14a,  <16 x float> %unpckhps17a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>; 
+	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpcklps20a = shufflevector <4 x float> %unpcklps20,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 
+	%r2 = shufflevector <16 x float> %r1,  <16 x float> %unpcklps20a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>; 
+	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+	%unpckhps23a = shufflevector <4 x float> %unpckhps23,  <4 x float> undef,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>; 
+	%r3 = shufflevector <16 x float> %r2,  <16 x float> %unpckhps23a, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>; 
+	%r4 = shufflevector <16 x float> %r3,  <16 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>; 
+	ret <8 x float> %r4;
+}
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
+; RUN: grep unpcklps %t | count 1
+; RUN: grep unpckhps %t | count 3
+
+; Transpose example using the more generic vector shuffle. Return float8
+; instead of float16
+; ModuleID = 'transpose2_opt.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i386-apple-cl.1.0"
+@r0 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+@r1 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+@r2 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+@r3 = common global <4 x float> zeroinitializer, align 16		; <<4 x float>*> [#uses=1]
+
+define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
+entry:
+	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
+	%unpckhps11 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
+	%unpcklps14 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpckhps17 = shufflevector <4 x float> %unpcklps, <4 x float> %unpcklps8, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+        %r1 = shufflevector <4 x float> %unpcklps14,  <4 x float> %unpckhps17,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >; 
+	%unpcklps20 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=1]
+	%unpckhps23 = shufflevector <4 x float> %unpckhps, <4 x float> %unpckhps11, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=1]
+        %r2 = shufflevector <4 x float> %unpcklps20,  <4 x float> %unpckhps23,  <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7 >;
+;       %r3 = shufflevector <8 x float> %r1,  <8 x float> %r2,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; 
+	ret <8 x float> %r2;
+}
--- a/test/CodeGen/X86/vec_shuffle-27.ll
+++ b/test/CodeGen/X86/vec_shuffle-27.ll
@ -0,0 +1,17 @@
+; RUN: llvm-as < %s | llc -march=x86 -mattr=sse41 -o %t -f
+; RUN: grep addps %t | count 2
+; RUN: grep mulps %t | count 2
+; RUN: grep subps %t | count 2
+
+; ModuleID = 'vec_shuffle-27.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
+target triple = "i686-apple-cl.1.0"
+
+define <8 x float> @my2filter4_1d(<4 x float> %a, <8 x float> %T0, <8 x float> %T1) nounwind readnone {
+entry:
+	%tmp7 = shufflevector <4 x float> %a, <4 x float> undef, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3 >		; <<8 x float>> [#uses=1]
+	%sub = sub <8 x float> %T1, %T0		; <<8 x float>> [#uses=1]
+	%mul = mul <8 x float> %sub, %tmp7		; <<8 x float>> [#uses=1]
+	%add = add <8 x float> %mul, %T0		; <<8 x float>> [#uses=1]
+	ret <8 x float> %add
+}