mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
Optimization for certain shufflevector by using insertps.
Summary: If we're doing a v4f32/v4i32 shuffle on x86 with SSE4.1, we can lower certain shufflevectors to an insertps instruction: When most of the shufflevector result's elements come from one vector (and keep their index), and one element comes from another vector or a memory operand. Added tests for insertps optimizations on shufflevector. Added support and tests for v4i32 vector optimization. Reviewers: nadav Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D3475 llvm-svn: 207291
This commit is contained in:
parent
c54b3a7e23
commit
54c5ad74d7
@ -3931,6 +3931,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
|
||||||
|
/// specifies a shuffle of elements that is suitable for input to INSERTPS.
|
||||||
|
/// i. e: If all but one element come from the same vector.
|
||||||
|
static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
|
||||||
|
// TODO: Deal with AVX's VINSERTPS
|
||||||
|
if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
|
||||||
|
return false;
|
||||||
|
|
||||||
|
unsigned CorrectPosV1 = 0;
|
||||||
|
unsigned CorrectPosV2 = 0;
|
||||||
|
for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
|
||||||
|
if (Mask[i] == i)
|
||||||
|
++CorrectPosV1;
|
||||||
|
else if (Mask[i] == i + 4)
|
||||||
|
++CorrectPosV2;
|
||||||
|
|
||||||
|
if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
|
||||||
|
// We have 3 elements from one vector, and one from another.
|
||||||
|
return true;
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// Some special combinations that can be optimized.
|
// Some special combinations that can be optimized.
|
||||||
//
|
//
|
||||||
@ -7263,6 +7286,84 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
|
|||||||
getShuffleSHUFImmediate(SVOp), DAG);
|
getShuffleSHUFImmediate(SVOp), DAG);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// It is only safe to call this function if isINSERTPSMask is true for
|
||||||
|
// this shufflevector mask.
|
||||||
|
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
|
||||||
|
SelectionDAG &DAG) {
|
||||||
|
// Generate an insertps instruction when inserting an f32 from memory onto a
|
||||||
|
// v4f32 or when copying a member from one v4f32 to another.
|
||||||
|
// We also use it for transferring i32 from one register to another,
|
||||||
|
// since it simply copies the same bits.
|
||||||
|
// If we're transfering an i32 from memory to a specific element in a
|
||||||
|
// register, we output a generic DAG that will match the PINSRD
|
||||||
|
// instruction.
|
||||||
|
// TODO: Optimize for AVX cases too (VINSERTPS)
|
||||||
|
MVT VT = SVOp->getSimpleValueType(0);
|
||||||
|
MVT EVT = VT.getVectorElementType();
|
||||||
|
SDValue V1 = SVOp->getOperand(0);
|
||||||
|
SDValue V2 = SVOp->getOperand(1);
|
||||||
|
auto Mask = SVOp->getMask();
|
||||||
|
assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
|
||||||
|
"unsupported vector type for insertps/pinsrd");
|
||||||
|
|
||||||
|
int FromV1 = std::count_if(Mask.begin(), Mask.end(),
|
||||||
|
[](const int &i) { return i < 4; });
|
||||||
|
|
||||||
|
SDValue From;
|
||||||
|
SDValue To;
|
||||||
|
unsigned DestIndex;
|
||||||
|
if (FromV1 == 1) {
|
||||||
|
From = V1;
|
||||||
|
To = V2;
|
||||||
|
DestIndex = std::find_if(Mask.begin(), Mask.end(),
|
||||||
|
[](const int &i) { return i < 4; }) -
|
||||||
|
Mask.begin();
|
||||||
|
} else {
|
||||||
|
From = V2;
|
||||||
|
To = V1;
|
||||||
|
DestIndex = std::find_if(Mask.begin(), Mask.end(),
|
||||||
|
[](const int &i) { return i >= 4; }) -
|
||||||
|
Mask.begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (MayFoldLoad(From)) {
|
||||||
|
// Trivial case, when From comes from a load and is only used by the
|
||||||
|
// shuffle. Make it use insertps from the vector that we need from that
|
||||||
|
// load.
|
||||||
|
SDValue Addr = From.getOperand(1);
|
||||||
|
SDValue NewAddr =
|
||||||
|
DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
|
||||||
|
DAG.getConstant(DestIndex * EVT.getStoreSize(),
|
||||||
|
Addr.getSimpleValueType()));
|
||||||
|
|
||||||
|
LoadSDNode *Load = cast<LoadSDNode>(From);
|
||||||
|
SDValue NewLoad =
|
||||||
|
DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
|
||||||
|
DAG.getMachineFunction().getMachineMemOperand(
|
||||||
|
Load->getMemOperand(), 0, EVT.getStoreSize()));
|
||||||
|
|
||||||
|
if (EVT == MVT::f32) {
|
||||||
|
// Create this as a scalar to vector to match the instruction pattern.
|
||||||
|
SDValue LoadScalarToVector =
|
||||||
|
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
|
||||||
|
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
|
||||||
|
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
|
||||||
|
InsertpsMask);
|
||||||
|
} else { // EVT == MVT::i32
|
||||||
|
// If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
|
||||||
|
// instruction, to match the PINSRD instruction, which loads an i32 to a
|
||||||
|
// certain vector element.
|
||||||
|
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
|
||||||
|
DAG.getConstant(DestIndex, MVT::i32));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Vector-element-to-vector
|
||||||
|
unsigned SrcIndex = Mask[DestIndex] % 4;
|
||||||
|
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
|
||||||
|
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
|
||||||
|
}
|
||||||
|
|
||||||
// Reduce a vector shuffle to zext.
|
// Reduce a vector shuffle to zext.
|
||||||
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
|
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
|
||||||
SelectionDAG &DAG) {
|
SelectionDAG &DAG) {
|
||||||
@ -7674,6 +7775,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
|
|||||||
if (BlendOp.getNode())
|
if (BlendOp.getNode())
|
||||||
return BlendOp;
|
return BlendOp;
|
||||||
|
|
||||||
|
if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
|
||||||
|
return getINSERTPS(SVOp, dl, DAG);
|
||||||
|
|
||||||
unsigned Imm8;
|
unsigned Imm8;
|
||||||
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
|
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
|
||||||
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
|
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
|
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
|
||||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
|
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
|
||||||
|
|
||||||
@g16 = external global i16
|
@g16 = external global i16
|
||||||
|
|
||||||
@ -249,3 +249,74 @@ entry:
|
|||||||
; X64: ret
|
; X64: ret
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; Lanes 0-2 of the result stay from %a; lane 3 takes element 0 of the vector
; loaded from %pb. Expected lowering: an insertps with immediate 48
; (destination lane 3 shifted left by 4) and no shufps.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
|
||||||
|
entry:
|
||||||
|
%0 = load <4 x float>* %pb, align 16
|
||||||
|
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
||||||
|
ret <4 x float> %vecinit6
|
||||||
|
; CHECK-LABEL: insertps_from_shufflevector_1:
|
||||||
|
; CHECK-NOT: shufps
|
||||||
|
; CHECK: insertps $48,
|
||||||
|
; CHECK: ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Register-to-register case: lane 2 of the result takes element 1 of %b, the
; other lanes stay from %a. Expected lowering: an insertps with immediate 96
; (destination lane 2 shifted left by 4, source lane 1 shifted left by 6),
; with no mov and no shufps before it.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
|
||||||
|
entry:
|
||||||
|
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
|
||||||
|
ret <4 x float> %vecinit6
|
||||||
|
; CHECK-LABEL: insertps_from_shufflevector_2:
|
||||||
|
; CHECK-NOT: mov
|
||||||
|
; CHECK-NOT: shufps
|
||||||
|
; CHECK: insertps $96,
|
||||||
|
; CHECK: ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; For loading an i32 from memory into an xmm register we use pinsrd
|
||||||
|
; instead of insertps
|
||||||
|
; Integer variant of the load case: lanes 0-2 stay from %a and lane 3 takes
; element 0 of the <4 x i32> loaded from %pb. Expected lowering: a pinsrd into
; lane 3 with the load folded (no separate mov) and no shufps.
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
|
||||||
|
entry:
|
||||||
|
%0 = load <4 x i32>* %pb, align 16
|
||||||
|
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
||||||
|
ret <4 x i32> %vecinit6
|
||||||
|
; CHECK-LABEL: pinsrd_from_shufflevector_i32:
|
||||||
|
; CHECK-NOT: mov
|
||||||
|
; CHECK-NOT: shufps
|
||||||
|
; CHECK: pinsrd $3,
|
||||||
|
; CHECK: ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; Register-to-register i32 case: lane 1 of the result takes element 3 of %b,
; the other lanes stay from %a. Expected lowering: an insertps with immediate
; 208 (destination lane 1 shifted left by 4, source lane 3 shifted left by 6),
; with no mov and no shufps before it.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
|
||||||
|
entry:
|
||||||
|
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
|
||||||
|
ret <4 x i32> %vecinit6
|
||||||
|
; CHECK-LABEL: insertps_from_shufflevector_i32_2:
|
||||||
|
; CHECK-NOT: mov
|
||||||
|
; CHECK-NOT: shufps
|
||||||
|
; CHECK: insertps $208,
|
||||||
|
; CHECK: ret
|
||||||
|
}
|
||||||
|
|
||||||
|
; A scalar float is loaded from %b, placed in lane 0 of an otherwise undef
; vector, and then shuffled into lane 1 of %a. Expected lowering: an insertps
; with immediate 16 (destination lane 1 shifted left by 4, source lane 0),
; with no mov and no shufps before it.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
|
||||||
|
; CHECK-LABEL: insertps_from_load_ins_elt_undef:
|
||||||
|
; CHECK-NOT: mov
|
||||||
|
; CHECK-NOT: shufps
|
||||||
|
; CHECK: insertps $16,
|
||||||
|
; CHECK: ret
|
||||||
|
%1 = load float* %b, align 4
|
||||||
|
%2 = insertelement <4 x float> undef, float %1, i32 0
|
||||||
|
%result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
|
||||||
|
ret <4 x float> %result
|
||||||
|
}
|
||||||
|
|
||||||
|
; Integer variant: a scalar i32 is loaded from %b, placed in lane 0 of an
; otherwise undef vector, and then shuffled into lane 2 of %a. Expected
; lowering: an insertps with immediate 32 (destination lane 2 shifted left by
; 4, source lane 0) and no shufps. The "no mov" expectation below is
; deliberately disabled (see the TODO) because a mov is still emitted here.
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
|
||||||
|
; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
|
||||||
|
; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
|
||||||
|
;; aCHECK-NOT: mov
|
||||||
|
; CHECK-NOT: shufps
|
||||||
|
; CHECK: insertps $32,
|
||||||
|
; CHECK: ret
|
||||||
|
%1 = load i32* %b, align 4
|
||||||
|
%2 = insertelement <4 x i32> undef, i32 %1, i32 0
|
||||||
|
%result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
|
||||||
|
ret <4 x i32> %result
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user