1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00

[X86] Improve lowering of vXi1 insert_subvectors to better utilize (insert_subvector zero, vec, 0) for zeroing upper bits.

This can be better recognized during isel when the producer already zeroed the upper bits.

llvm-svn: 320267
This commit is contained in:
Craig Topper 2017-12-09 22:44:42 +00:00
parent 4f4608b667
commit 4d36bf76a1
2 changed files with 101 additions and 89 deletions

View File

@ -5013,6 +5013,10 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (!isa<ConstantSDNode>(Idx))
return SDValue();
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
@ -5020,19 +5024,21 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
MVT WideOpVT = OpVT;
if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
// Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
// if necessary.
if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
// Need to promote to v16i1, do the insert, then extract back.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
getZeroVector(MVT::v16i1, Subtarget, DAG, dl),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op,
DAG.getIntPtrConstant(0, dl));
}
return Op;
// May need to promote to a legal type.
Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, Idx);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
MVT SubVecVT = SubVec.getSimpleValueType();
@ -5042,30 +5048,32 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
// extend to natively supported kshift
MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
MVT WideOpVT = OpVT;
if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
WideOpVT = MinVT;
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Undef = DAG.getUNDEF(WideOpVT);
SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
// Extract sub-vector if required.
auto ExtractSubVec = [&](SDValue V) {
return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
OpVT, V, ZeroIdx);
};
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, ZeroIdx);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op,
ZeroIdx);
}
SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, SubVec, ZeroIdx);
if (Vec.isUndef()) {
if (IdxVal != 0) {
SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
ShiftBits);
}
return ExtractSubVec(WideSubVec);
assert(IdxVal != 0 && "Unexpected index");
Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
@ -5073,48 +5081,60 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(ShiftRight, dl, MVT::i8));
return ExtractSubVec(Vec);
}
if (IdxVal == 0) {
// Zero lower bits of the Vec
SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
// Merge them together, SubVec should be zero extended.
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
SubVec, ZeroIdx);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
DAG.getConstant(ShiftRight, dl, MVT::i8));
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
// Zero upper bits of the Vec
WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
return ExtractSubVec(Vec);
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
// isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
getZeroVector(WideOpVT, Subtarget, DAG, dl),
Vec, ZeroIdx);
} else {
// Otherwise use explicit shifts to zero the bits.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
Undef, Vec, ZeroIdx);
NumElems = WideOpVT.getVectorNumElements();
SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
}
Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
// Subvector should be inserted in the middle - use shuffle
WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
SubVec, ZeroIdx);
SmallVector<int, 64> Mask;
for (unsigned i = 0; i < NumElems; ++i)
Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
i : i + NumElems);
return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
// Inserting into the middle is more complicated.
NumElems = WideOpVT.getVectorNumElements();
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
// Move the current value of the bit to be replaced to the lsbs.
Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Xor with the new bit.
Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
// Shift to MSB, filling bottom bits with 0.
unsigned ShiftLeft = NumElems - SubVecNumElems;
Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
DAG.getConstant(ShiftLeft, dl, MVT::i8));
// Shift to the final position, filling upper bits with 0.
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
DAG.getConstant(ShiftRight, dl, MVT::i8));
// Xor with original vector leaving the new value.
Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
// Reduce to original width if needed.
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128

View File

@ -56,14 +56,12 @@ define <8 x i1> @test3(<4 x i1> %a) {
define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test4:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2w %k0, %xmm0
; CHECK-NEXT: retq
@ -74,14 +72,12 @@ define <8 x i1> @test4(<4 x i1> %a, <4 x i1>%b) {
define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test5:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
; CHECK-NEXT: kshiftlb $2, %k1, %k1
; CHECK-NEXT: kshiftlb $6, %k0, %k0
; CHECK-NEXT: kshiftrb $6, %k0, %k0
; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2d %k0, %xmm0
; CHECK-NEXT: retq
@ -92,14 +88,12 @@ define <4 x i1> @test5(<2 x i1> %a, <2 x i1>%b) {
define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
; CHECK-LABEL: test6:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $63, %xmm1, %xmm1
; CHECK-NEXT: vptestmq %xmm1, %xmm1, %k0
; CHECK-NEXT: vpsllq $63, %xmm0, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k0
; CHECK-NEXT: vpsllq $63, %xmm1, %xmm0
; CHECK-NEXT: vptestmq %xmm0, %xmm0, %k1
; CHECK-NEXT: kshiftlb $2, %k1, %k1
; CHECK-NEXT: kshiftlb $6, %k0, %k0
; CHECK-NEXT: kshiftrb $6, %k0, %k0
; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: kshiftlb $2, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2b %k0, %xmm0
; CHECK-NEXT: retq
@ -110,14 +104,12 @@ define <16 x i1> @test6(<2 x i1> %a, <2 x i1>%b) {
define <32 x i1> @test7(<4 x i1> %a, <4 x i1>%b) {
; CHECK-LABEL: test7:
; CHECK: # %bb.0:
; CHECK-NEXT: vpslld $31, %xmm1, %xmm1
; CHECK-NEXT: vptestmd %xmm1, %xmm1, %k0
; CHECK-NEXT: vpslld $31, %xmm0, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k0
; CHECK-NEXT: vpslld $31, %xmm1, %xmm0
; CHECK-NEXT: vptestmd %xmm0, %xmm0, %k1
; CHECK-NEXT: kshiftlb $4, %k1, %k1
; CHECK-NEXT: kshiftlb $4, %k0, %k0
; CHECK-NEXT: kshiftrb $4, %k0, %k0
; CHECK-NEXT: korb %k1, %k0, %k0
; CHECK-NEXT: korb %k0, %k1, %k0
; CHECK-NEXT: vpmovm2b %k0, %ymm0
; CHECK-NEXT: retq