mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
Optimization for certain shufflevector by using insertps.
Summary: If we're doing a v4f32/v4i32 shuffle on x86 with SSE4.1, we can lower certain shufflevectors to an insertps instruction: When most of the shufflevector result's elements come from one vector (and keep their index), and one element comes from another vector or a memory operand. Added tests for insertps optimizations on shufflevector. Added support and tests for v4i32 vector optimization. Reviewers: nadav Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D3475 llvm-svn: 207291
This commit is contained in:
parent
c54b3a7e23
commit
54c5ad74d7
@ -3931,6 +3931,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
|
||||
/// specifies a shuffle of elements that is suitable for input to INSERTPS.
|
||||
/// i. e: If all but one element come from the same vector.
|
||||
static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
|
||||
// TODO: Deal with AVX's VINSERTPS
|
||||
if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
|
||||
return false;
|
||||
|
||||
unsigned CorrectPosV1 = 0;
|
||||
unsigned CorrectPosV2 = 0;
|
||||
for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
|
||||
if (Mask[i] == i)
|
||||
++CorrectPosV1;
|
||||
else if (Mask[i] == i + 4)
|
||||
++CorrectPosV2;
|
||||
|
||||
if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
|
||||
// We have 3 elements from one vector, and one from another.
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// Some special combinations that can be optimized.
|
||||
//
|
||||
@ -7263,6 +7286,84 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
|
||||
getShuffleSHUFImmediate(SVOp), DAG);
|
||||
}
|
||||
|
||||
// It is only safe to call this function if isINSERTPSMask is true for
|
||||
// this shufflevector mask.
|
||||
static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
|
||||
SelectionDAG &DAG) {
|
||||
// Generate an insertps instruction when inserting an f32 from memory onto a
|
||||
// v4f32 or when copying a member from one v4f32 to another.
|
||||
// We also use it for transferring i32 from one register to another,
|
||||
// since it simply copies the same bits.
|
||||
// If we're transfering an i32 from memory to a specific element in a
|
||||
// register, we output a generic DAG that will match the PINSRD
|
||||
// instruction.
|
||||
// TODO: Optimize for AVX cases too (VINSERTPS)
|
||||
MVT VT = SVOp->getSimpleValueType(0);
|
||||
MVT EVT = VT.getVectorElementType();
|
||||
SDValue V1 = SVOp->getOperand(0);
|
||||
SDValue V2 = SVOp->getOperand(1);
|
||||
auto Mask = SVOp->getMask();
|
||||
assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
|
||||
"unsupported vector type for insertps/pinsrd");
|
||||
|
||||
int FromV1 = std::count_if(Mask.begin(), Mask.end(),
|
||||
[](const int &i) { return i < 4; });
|
||||
|
||||
SDValue From;
|
||||
SDValue To;
|
||||
unsigned DestIndex;
|
||||
if (FromV1 == 1) {
|
||||
From = V1;
|
||||
To = V2;
|
||||
DestIndex = std::find_if(Mask.begin(), Mask.end(),
|
||||
[](const int &i) { return i < 4; }) -
|
||||
Mask.begin();
|
||||
} else {
|
||||
From = V2;
|
||||
To = V1;
|
||||
DestIndex = std::find_if(Mask.begin(), Mask.end(),
|
||||
[](const int &i) { return i >= 4; }) -
|
||||
Mask.begin();
|
||||
}
|
||||
|
||||
if (MayFoldLoad(From)) {
|
||||
// Trivial case, when From comes from a load and is only used by the
|
||||
// shuffle. Make it use insertps from the vector that we need from that
|
||||
// load.
|
||||
SDValue Addr = From.getOperand(1);
|
||||
SDValue NewAddr =
|
||||
DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
|
||||
DAG.getConstant(DestIndex * EVT.getStoreSize(),
|
||||
Addr.getSimpleValueType()));
|
||||
|
||||
LoadSDNode *Load = cast<LoadSDNode>(From);
|
||||
SDValue NewLoad =
|
||||
DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
|
||||
DAG.getMachineFunction().getMachineMemOperand(
|
||||
Load->getMemOperand(), 0, EVT.getStoreSize()));
|
||||
|
||||
if (EVT == MVT::f32) {
|
||||
// Create this as a scalar to vector to match the instruction pattern.
|
||||
SDValue LoadScalarToVector =
|
||||
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
|
||||
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
|
||||
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
|
||||
InsertpsMask);
|
||||
} else { // EVT == MVT::i32
|
||||
// If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
|
||||
// instruction, to match the PINSRD instruction, which loads an i32 to a
|
||||
// certain vector element.
|
||||
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
|
||||
DAG.getConstant(DestIndex, MVT::i32));
|
||||
}
|
||||
}
|
||||
|
||||
// Vector-element-to-vector
|
||||
unsigned SrcIndex = Mask[DestIndex] % 4;
|
||||
SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
|
||||
return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
|
||||
}
|
||||
|
||||
// Reduce a vector shuffle to zext.
|
||||
static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
|
||||
SelectionDAG &DAG) {
|
||||
@ -7674,6 +7775,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
|
||||
if (BlendOp.getNode())
|
||||
return BlendOp;
|
||||
|
||||
if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
|
||||
return getINSERTPS(SVOp, dl, DAG);
|
||||
|
||||
unsigned Imm8;
|
||||
if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
|
||||
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
|
||||
|
@ -1,5 +1,5 @@
|
||||
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
|
||||
; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
|
||||
; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
|
||||
|
||||
@g16 = external global i16
|
||||
|
||||
@ -249,3 +249,74 @@ entry:
|
||||
; X64: ret
|
||||
}
|
||||
|
||||
; Lanes 0-2 of the result keep %a's elements; lane 3 receives element 0 of the
; vector loaded from %pb (mask index 4). The expected immediate 48 is 3 << 4,
; i.e. destination lane 3 with no source-lane bits set.
define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
|
||||
entry:
|
||||
%0 = load <4 x float>* %pb, align 16
|
||||
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
||||
ret <4 x float> %vecinit6
|
||||
; CHECK-LABEL: insertps_from_shufflevector_1:
|
||||
; CHECK-NOT: shufps
|
||||
; CHECK: insertps $48,
|
||||
; CHECK: ret
|
||||
}
|
||||
|
||||
; Register-to-register case: lane 2 of the result receives element 1 of %b
; (mask index 5), all other lanes keep %a's elements. The expected immediate
; 96 is (2 << 4) | (1 << 6): destination lane 2, source lane 1.
define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
|
||||
entry:
|
||||
%vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
|
||||
ret <4 x float> %vecinit6
|
||||
; CHECK-LABEL: insertps_from_shufflevector_2:
|
||||
; CHECK-NOT: mov
|
||||
; CHECK-NOT: shufps
|
||||
; CHECK: insertps $96,
|
||||
; CHECK: ret
|
||||
}
|
||||
|
||||
; For loading an i32 from memory into an xmm register we use pinsrd
|
||||
; instead of insertps
|
||||
; Lane 3 of the result receives element 0 of the vector loaded from %pb
; (mask index 4); the expected pinsrd immediate 3 is that destination lane.
define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
|
||||
entry:
|
||||
%0 = load <4 x i32>* %pb, align 16
|
||||
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
||||
ret <4 x i32> %vecinit6
|
||||
; CHECK-LABEL: pinsrd_from_shufflevector_i32:
|
||||
; CHECK-NOT: mov
|
||||
; CHECK-NOT: shufps
|
||||
; CHECK: pinsrd $3,
|
||||
; CHECK: ret
|
||||
}
|
||||
|
||||
; Integer register-to-register case, still expected to use insertps: lane 1
; of the result receives element 3 of %b (mask index 7). The expected
; immediate 208 is (1 << 4) | (3 << 6): destination lane 1, source lane 3.
define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
|
||||
entry:
|
||||
%vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
|
||||
ret <4 x i32> %vecinit6
|
||||
; CHECK-LABEL: insertps_from_shufflevector_i32_2:
|
||||
; CHECK-NOT: mov
|
||||
; CHECK-NOT: shufps
|
||||
; CHECK: insertps $208,
|
||||
; CHECK: ret
|
||||
}
|
||||
|
||||
; The second shuffle input is a scalar float loaded from %b and placed in
; element 0 of an otherwise undef vector. Lane 1 of the result receives that
; element (mask index 4). The expected immediate 16 is 1 << 4: destination
; lane 1, source lane 0.
define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
|
||||
; CHECK-LABEL: insertps_from_load_ins_elt_undef:
|
||||
; CHECK-NOT: mov
|
||||
; CHECK-NOT: shufps
|
||||
; CHECK: insertps $16,
|
||||
; CHECK: ret
|
||||
%1 = load float* %b, align 4
|
||||
%2 = insertelement <4 x float> undef, float %1, i32 0
|
||||
%result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
|
||||
ret <4 x float> %result
|
||||
}
|
||||
|
||||
; Integer variant: the second shuffle input is an i32 loaded from %b and
; placed in element 0 of an otherwise undef vector. Lane 2 of the result
; receives that element (mask index 4). The expected immediate 32 is 2 << 4:
; destination lane 2, source lane 0.
; NOTE(review): the "no mov" directive below is spelled with a leading 'a',
; presumably to keep it inactive until the TODO is resolved - confirm.
define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
|
||||
; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
|
||||
; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
|
||||
;; aCHECK-NOT: mov
|
||||
; CHECK-NOT: shufps
|
||||
; CHECK: insertps $32,
|
||||
; CHECK: ret
|
||||
%1 = load i32* %b, align 4
|
||||
%2 = insertelement <4 x i32> undef, i32 %1, i32 0
|
||||
%result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
|
||||
ret <4 x i32> %result
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user