diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f2e36cdb343..0df66128b82 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -13316,12 +13316,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// It makes no assumptions about whether this is the *best* lowering, it simply
 /// uses it.
 static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
-                                      ArrayRef<int> OriginalMask, SDValue V1,
+                                      ArrayRef<int> Mask, SDValue V1,
                                       SDValue V2, SelectionDAG &DAG) {
   SDValue LowV = V1, HighV = V2;
-  SmallVector<int, 4> Mask(OriginalMask.begin(), OriginalMask.end());
-  SmallVector<int, 4> NewMask = Mask;
-
+  SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
   int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
 
   if (NumV2Elements == 1) {
@@ -13358,14 +13356,6 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
       NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
     }
   } else if (NumV2Elements == 2) {
-    // If we are likely to fold V1 but not V2, then commute the shuffle.
-    if (MayFoldLoad(V1) && !MayFoldLoad(V2)) {
-      ShuffleVectorSDNode::commuteMask(Mask);
-      NewMask = Mask;
-      std::swap(V1, V2);
-      std::swap(LowV, HighV);
-    }
-
     if (Mask[0] < 4 && Mask[1] < 4) {
       // Handle the easy case where we have V1 in the low lanes and V2 in the
       // high lanes.
@@ -34598,6 +34588,28 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     }
   }
 
+  // Attempt to commute shufps LHS loads:
+  // permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
+  if (VT == MVT::v4f32 &&
+      (X86ISD::VPERMILPI == Opcode ||
+       (X86ISD::SHUFP == Opcode && N.getOperand(0) == N.getOperand(1)))) {
+    SDValue N0 = N.getOperand(0);
+    unsigned Imm = N.getConstantOperandVal(X86ISD::VPERMILPI == Opcode ? 1 : 2);
+    if (N0.getOpcode() == X86ISD::SHUFP && N->isOnlyUserOf(N0.getNode())) {
+      SDValue N00 = N0.getOperand(0);
+      SDValue N01 = N0.getOperand(1);
+      if (MayFoldLoad(peekThroughOneUseBitcasts(N00)) &&
+          !MayFoldLoad(peekThroughOneUseBitcasts(N01))) {
+        unsigned Imm1 = N0.getConstantOperandVal(2);
+        Imm1 = ((Imm1 & 0x0F) << 4) | ((Imm1 & 0xF0) >> 4);
+        SDValue NewN0 = DAG.getNode(X86ISD::SHUFP, DL, VT, N01, N00,
+                                    DAG.getTargetConstant(Imm1, DL, MVT::i8));
+        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewN0, NewN0,
+                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+      }
+    }
+  }
+
   switch (Opcode) {
   case X86ISD::VBROADCAST: {
     SDValue Src = N.getOperand(0);
diff --git a/test/CodeGen/X86/insertelement-duplicates.ll b/test/CodeGen/X86/insertelement-duplicates.ll
index 2f32c5a2e6b..3f693728e6f 100644
--- a/test/CodeGen/X86/insertelement-duplicates.ll
+++ b/test/CodeGen/X86/insertelement-duplicates.ll
@@ -9,22 +9,22 @@ define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %des
 ; SSE-32:       # %bb.0: # %L.entry
 ; SSE-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SSE-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; SSE-32-NEXT:    movaps 304(%ecx), %xmm0
+; SSE-32-NEXT:    xorps %xmm0, %xmm0
 ; SSE-32-NEXT:    xorps %xmm1, %xmm1
-; SSE-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
-; SSE-32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-32-NEXT:    movups %xmm1, 624(%eax)
-; SSE-32-NEXT:    movups %xmm0, 608(%eax)
+; SSE-32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
+; SSE-32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE-32-NEXT:    movups %xmm0, 624(%eax)
+; SSE-32-NEXT:    movups %xmm1, 608(%eax)
 ; SSE-32-NEXT:    retl
 ;
 ; SSE-64-LABEL: PR15298:
 ; SSE-64:       # %bb.0: # %L.entry
-; SSE-64-NEXT:    movaps 304(%rdi), %xmm0
+; SSE-64-NEXT:    xorps %xmm0, %xmm0
 ; SSE-64-NEXT:    xorps %xmm1, %xmm1
-; SSE-64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
-; SSE-64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
-; SSE-64-NEXT:    movups %xmm1, 624(%rsi)
-; SSE-64-NEXT:    movups %xmm0, 608(%rsi)
+; SSE-64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
+; SSE-64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
+; SSE-64-NEXT:    movups %xmm0, 624(%rsi)
+; SSE-64-NEXT:    movups %xmm1, 608(%rsi)
 ; SSE-64-NEXT:    retq
 ;
 ; AVX-32-LABEL: PR15298: