Optimization for certain shufflevector by using insertps.

Summary: If we're doing a v4f32/v4i32 shuffle on x86 with SSE4.1, we can lower certain shufflevectors to an insertps instruction: When most of the shufflevector result's elements come from one vector (and keep their index), and one element comes from another vector or a memory operand. Added tests for insertps optimizations on shufflevector. Added support and tests for v4i32 vector optimization. Reviewers: nadav Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D3475 llvm-svn: 207291
2024-11-23 19:23:23 +01:00 · 2014-04-25 23:51:17 +00:00 · 2014-04-25 23:51:17 +00:00 · 54c5ad74d7
commit 54c5ad74d7
parent c54b3a7e23
2 changed files with 177 additions and 2 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -3931,6 +3931,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
  return true;
 }
 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to INSERTPS,
 /// i.e. exactly three elements come from one input vector and keep their
 /// index, and the remaining element comes from the *other* input vector
 /// (at any index).
 ///
 /// Masks with undef (negative) entries, and masks whose fourth element comes
 /// from the same vector as the other three, are rejected: getINSERTPS
 /// locates the element to insert by looking for the single mask entry that
 /// refers to the other input, so such an entry has to exist.
 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
  // TODO: Deal with AVX's VINSERTPS
  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
    return false;
  // Elements that come from V1/V2 *and* stay at their original index.
  unsigned CorrectPosV1 = 0;
  unsigned CorrectPosV2 = 0;
  // Elements that come from V1/V2 at any index.
  unsigned FromV1 = 0;
  unsigned FromV2 = 0;
  const int NumElts = static_cast<int>(VT.getVectorNumElements());
  for (int i = 0; i != NumElts; ++i) {
    const int M = Mask[i];
    if (M < 0)
      // Undef element: belongs to neither input.
      continue;
    if (M < NumElts) {
      ++FromV1;
      if (M == i)
        ++CorrectPosV1;
    } else {
      ++FromV2;
      if (M == i + NumElts)
        ++CorrectPosV2;
    }
  }
  // We need 3 in-place elements from one vector, and the remaining element
  // must really come from the other vector.
  return (CorrectPosV1 == 3 && FromV2 == 1) ||
         (CorrectPosV2 == 3 && FromV1 == 1);
 }
 //
 // Some special combinations that can be optimized.
 //
@ -7263,6 +7286,84 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                              getShuffleSHUFImmediate(SVOp), DAG);
 }
 // getINSERTPS - Lower a v4f32/v4i32 shuffle that keeps three elements of one
 // input in place and takes the fourth element from the other input.
 // Returns an X86ISD::INSERTPS node, or an ISD::INSERT_VECTOR_ELT node when
 // an i32 is taken directly from a foldable load.
 // It is only safe to call this function if isINSERTPSMask is true for
 // this shufflevector mask.
 static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
                           SelectionDAG &DAG) {
  // Generate an insertps instruction when inserting an f32 from memory onto a
  // v4f32 or when copying a member from one v4f32 to another.
  // We also use it for transferring i32 from one register to another,
  // since it simply copies the same bits.
  // If we're transferring an i32 from memory to a specific element in a
  // register, we output a generic DAG that will match the PINSRD
  // instruction.
  // TODO: Optimize for AVX cases too (VINSERTPS)
  MVT VT = SVOp->getSimpleValueType(0);
  MVT EltVT = VT.getVectorElementType();
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  auto Mask = SVOp->getMask();
  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
         "unsupported vector type for insertps/pinsrd");
  // Undef mask entries are negative; they must not be counted as V1 elements.
  auto IsFromV1 = [](const int &i) { return i >= 0 && i < 4; };
  auto IsFromV2 = [](const int &i) { return i >= 4; };
  int FromV1 = std::count_if(Mask.begin(), Mask.end(), IsFromV1);
  int FromV2 = std::count_if(Mask.begin(), Mask.end(), IsFromV2);
  (void)FromV2; // Only used by the assert below.
  assert(FromV1 + FromV2 == 4 && (FromV1 == 1 || FromV2 == 1) &&
         "expected three elements from one vector and one from the other");
  // From: the vector that contributes the single element.
  // To:   the vector that contributes the other three elements.
  SDValue From;
  SDValue To;
  unsigned DestIndex;
  if (FromV1 == 1) {
    From = V1;
    To = V2;
    DestIndex =
        std::find_if(Mask.begin(), Mask.end(), IsFromV1) - Mask.begin();
  } else {
    From = V2;
    To = V1;
    DestIndex =
        std::find_if(Mask.begin(), Mask.end(), IsFromV2) - Mask.begin();
  }
  assert(DestIndex < 4 && "no element is taken from the other vector");
  // Index, inside From, of the element that gets inserted.
  unsigned SrcIndex = Mask[DestIndex] % 4;
  if (MayFoldLoad(From)) {
    // Trivial case, when From comes from a load and is only used by the
    // shuffle. Make it use insertps from the vector that we need from that
    // load.
    // The scalar we need lives at the *source* element's offset inside the
    // loaded vector, not at the destination element's offset.
    unsigned ByteOffset = SrcIndex * EltVT.getStoreSize();
    SDValue Addr = From.getOperand(1);
    SDValue NewAddr =
        DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
                    DAG.getConstant(ByteOffset, Addr.getSimpleValueType()));
    LoadSDNode *Load = cast<LoadSDNode>(From);
    // Keep the memory operand in sync with the address that is really read.
    SDValue NewLoad =
        DAG.getLoad(EltVT, dl, Load->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Load->getMemOperand(), ByteOffset,
                        EltVT.getStoreSize()));
    if (EltVT == MVT::f32) {
      // Create this as a scalar to vector to match the instruction pattern.
      // The scalar sits in element 0 of the new vector, so the source-select
      // bits of the immediate stay zero.
      SDValue LoadScalarToVector =
          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
                         InsertpsMask);
    } else { // EltVT == MVT::i32
      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
      // instruction, to match the PINSRD instruction, which loads an i32 to a
      // certain vector element.
      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
                         DAG.getConstant(DestIndex, MVT::i32));
    }
  }
  // Vector-element-to-vector
  // Immediate layout: bits [7:6] select the source element, bits [5:4] select
  // the destination element, and the zero mask in bits [3:0] is left clear.
  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
 }
 // Reduce a vector shuffle to zext.
 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                                    SelectionDAG &DAG) {
@ -7674,6 +7775,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  if (BlendOp.getNode())
    return BlendOp;
  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
    return getINSERTPS(SVOp, dl, DAG);
  unsigned Imm8;
  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
@g16 = external global i16
@ -249,3 +249,74 @@ entry:
 ; X64: ret
 }
 ; Shuffle mask <0, 1, 2, 4>: result lanes 0-2 are lanes 0-2 of %a, and result
 ; lane 3 is lane 0 of the <4 x float> loaded from %pb.
 ; Expected lowering: a single insertps with immediate 48 (destination lane
 ; 3 << 4) and no shufps. The directives below do not match the memory operand,
 ; so the address that insertps reads from is not verified by this test.
 define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
 entry:
  %0 = load <4 x float>* %pb, align 16
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x float> %vecinit6
 ; CHECK-LABEL: insertps_from_shufflevector_1:
 ; CHECK-NOT: shufps
 ; CHECK: insertps    $48,
 ; CHECK: ret
 }
 ; Shuffle mask <0, 1, 5, 3>: result lanes 0, 1 and 3 are taken in place from
 ; %a, and result lane 2 is lane 1 of %b (both operands are function
 ; arguments, no load involved).
 ; Expected lowering: insertps with immediate 96 (destination lane 2 << 4 |
 ; source lane 1 << 6), with no mov and no shufps before it.
 define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
 entry:
  %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
  ret <4 x float> %vecinit6
 ; CHECK-LABEL: insertps_from_shufflevector_2:
 ; CHECK-NOT: mov
 ; CHECK-NOT: shufps
 ; CHECK: insertps    $96,
 ; CHECK: ret
 }
 ; For loading an i32 from memory into an xmm register we use pinsrd
 ; instead of insertps
 ; Shuffle mask <0, 1, 2, 4>: result lanes 0-2 are lanes 0-2 of %a, and result
 ; lane 3 is lane 0 of the <4 x i32> loaded from %pb.
 ; Expected lowering: pinsrd with immediate 3 (the destination lane), with no
 ; mov and no shufps before it.
 define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
 entry:
  %0 = load <4 x i32>* %pb, align 16
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
  ret <4 x i32> %vecinit6
 ; CHECK-LABEL: pinsrd_from_shufflevector_i32:
 ; CHECK-NOT: mov
 ; CHECK-NOT: shufps
 ; CHECK: pinsrd  $3,
 ; CHECK: ret
 }
 ; Integer register-to-register case. Shuffle mask <0, 7, 2, 3>: result lanes
 ; 0, 2 and 3 are taken in place from %a, and result lane 1 is lane 3 of %b.
 ; Expected lowering: insertps (used for i32 as well, since it only copies
 ; bits) with immediate 208 (destination lane 1 << 4 | source lane 3 << 6),
 ; with no mov and no shufps before it.
 define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
 entry:
  %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
  ret <4 x i32> %vecinit6
 ; CHECK-LABEL: insertps_from_shufflevector_i32_2:
 ; CHECK-NOT: mov
 ; CHECK-NOT: shufps
 ; CHECK: insertps    $208,
 ; CHECK: ret
 }
 ; A scalar float is loaded from %b and placed in lane 0 of an otherwise undef
 ; vector; shuffle mask <0, 4, 2, 3> then moves that lane into result lane 1
 ; while lanes 0, 2 and 3 are taken in place from %a.
 ; Expected lowering: insertps with immediate 16 (destination lane 1 << 4),
 ; with no mov and no shufps before it.
 define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
 ; CHECK-LABEL: insertps_from_load_ins_elt_undef:
 ; CHECK-NOT: mov
 ; CHECK-NOT: shufps
 ; CHECK: insertps    $16,
 ; CHECK: ret
  %1 = load float* %b, align 4
  %2 = insertelement <4 x float> undef, float %1, i32 0
  %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
  ret <4 x float> %result
 }
 ; Integer variant of the scalar-load case: an i32 is loaded from %b and placed
 ; in lane 0 of an otherwise undef vector; shuffle mask <0, 1, 4, 3> then moves
 ; that lane into result lane 2 while lanes 0, 1 and 3 are taken in place from
 ; %a.
 ; Expected lowering: insertps with immediate 32 (destination lane 2 << 4) and
 ; no shufps. The no-mov directive below is intentionally disabled (its prefix
 ; is misspelled so FileCheck ignores it) until the extra mov is removed; see
 ; the TODO.
 define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
 ; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
 ; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
 ;; aCHECK-NOT: mov
 ; CHECK-NOT: shufps
 ; CHECK: insertps    $32,
 ; CHECK: ret
  %1 = load i32* %b, align 4
  %2 = insertelement <4 x i32> undef, i32 %1, i32 0
  %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
  ret <4 x i32> %result
 }