[x86] Start to introduce bit-masking based blend lowering.

This is the simplest form of bit-math based blending which only fires
when we are blending with zero and is relatively profitable. I've only
enabled this path on very specific lowering strategies. I'm planning to
widen its applicability in subsequent patches, but so far you'll notice
that even though we get fewer shufps instructions, we *still* do the bit
math in the FP execution port. I'm looking into why this is still
happening.

llvm-svn: 228124
Author: Chandler Carruth
Date:   2015-02-04 09:06:05 +00:00
Commit: 61ac2c112b (parent 62dda41067)
4 changed files with 72 additions and 54 deletions
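
The patch rests on a single identity: blending a vector with zero keeps some lanes and clears the rest, which is bit-for-bit an AND with a per-lane all-ones/all-zeros mask. A minimal standalone C++ sketch of that identity (plain arrays, not the SelectionDAG API; all names here are illustrative):

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      std::array<uint32_t, 4> V = {0x11111111, 0x22222222, 0x33333333, 0x44444444};
      // Lane mask for "keep lanes 0, 2, 3; zero lane 1", the shape of a
      // blend-with-zero shuffle such as <0, 4, 2, 3> against zeroinitializer.
      std::array<uint32_t, 4> LaneMask = {~0u, 0u, ~0u, ~0u};
      std::array<uint32_t, 4> Blend{}, Masked{};
      for (int i = 0; i < 4; ++i) {
        Blend[i] = LaneMask[i] ? V[i] : 0; // blend with zero: per-lane select
        Masked[i] = V[i] & LaneMask[i];    // bit-masking: per-lane AND
      }
      assert(Blend == Masked); // both lowerings produce identical lanes
    }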

@@ -7733,6 +7733,46 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
   return Zeroable;
 }
 
+/// \brief Try to emit a bitmask instruction for a shuffle.
+///
+/// This handles cases where we can model a blend exactly as a bitmask due to
+/// one of the inputs being zeroable.
+static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
+                                           SDValue V2, ArrayRef<int> Mask,
+                                           SelectionDAG &DAG) {
+  MVT EltVT = VT.getScalarType();
+  int NumEltBits = EltVT.getSizeInBits();
+  MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
+  SDValue Zero = DAG.getConstant(0, IntEltVT);
+  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
+  if (EltVT.isFloatingPoint()) {
+    Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
+    AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
+  }
+  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
+  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  SDValue V;
+  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
+    if (Zeroable[i])
+      continue;
+    if (Mask[i] % Size != i)
+      return SDValue(); // Not a blend.
+    if (!V)
+      V = Mask[i] < Size ? V1 : V2;
+    else if (V != (Mask[i] < Size ? V1 : V2))
+      return SDValue(); // Can only let one input through the mask.
+    VMaskOps[i] = AllOnes;
+  }
+
+  if (!V)
+    return SDValue(); // No non-zeroable elements!
+
+  SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
+  V = DAG.getNode(VT.isFloatingPoint() ? X86ISD::FAND : ISD::AND, DL, VT, V,
+                  VMask);
+  return V;
+}
+
 /// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
 ///
 /// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
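
The matching loop in lowerVectorShuffleAsBitMask above reads the same outside SelectionDAG: a lane may pass if it is zeroable (undef or known zero) or if it selects element i of one consistent input. A standalone sketch of the same matching over plain containers; matchBitMask and its types are hypothetical stand-ins, not LLVM API:

    #include <cstdint>
    #include <optional>
    #include <vector>

    // Model of the matching loop: a shuffle is an identity blend if every
    // non-zeroable lane i selects element i of one and the same input.
    // Returns the per-lane AND mask, or nullopt to bail out.
    std::optional<std::vector<uint64_t>>
    matchBitMask(const std::vector<int> &Mask, const std::vector<bool> &Zeroable) {
      int Size = static_cast<int>(Mask.size());
      std::vector<uint64_t> LaneMask(Size, 0);
      int Input = -1; // 0 selects V1 lanes, 1 selects V2 lanes
      for (int i = 0; i < Size; ++i) {
        if (Zeroable[i])
          continue; // lane is zero anyway; leave its mask bits clear
        if (Mask[i] % Size != i)
          return std::nullopt; // not a blend: the lane changes position
        int This = Mask[i] < Size ? 0 : 1;
        if (Input == -1)
          Input = This;
        else if (Input != This)
          return std::nullopt; // only one input may pass through the mask
        LaneMask[i] = ~uint64_t(0); // let this lane through
      }
      if (Input == -1)
        return std::nullopt; // no non-zeroable elements
      return LaneMask;
    }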
@@ -8743,17 +8783,21 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Mask, Subtarget, DAG))
     return V;
 
-  if (Subtarget->hasSSE41())
-    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
-                                                  Subtarget, DAG))
-      return Blend;
+  if (SDValue Masked =
+          lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
+    return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
   if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
   // Try to use byte rotation instructions.
   // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
   if (Subtarget->hasSSSE3())
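
The same reordering is applied to v4i32 above and v8i16 below: the bitmask attempt now runs before the dedicated unpack patterns, while the SSE4.1 blend moves after them. Feeding a mask from the tests further down through the hypothetical matchBitMask sketch above shows what this path accepts:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Reuses the matchBitMask sketch above. shuffle_v4i32_0z23 in the tests
    // below shuffles %a with zeroinitializer using mask <0, 4, 2, 3>, so
    // lane 1 is zeroable and the rest are identity lanes of the first input.
    int main() {
      std::vector<int> Mask = {0, 4, 2, 3};
      std::vector<bool> Zeroable = {false, true, false, false};
      auto LaneMask = matchBitMask(Mask, Zeroable);
      assert(LaneMask); // accepted: a blend of one input with zero
      assert((*LaneMask)[0] == ~uint64_t(0) && (*LaneMask)[1] == 0);
      // Lanes {keep, zero, keep, keep}: the single andps in the checks below.
    }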
@@ -9455,17 +9499,21 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
                                                 Mask, Subtarget, DAG))
     return V;
 
-  if (Subtarget->hasSSE41())
-    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
-                                                  Subtarget, DAG))
-      return Blend;
+  if (SDValue Masked =
+          lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
+    return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
   if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
     return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
   if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
     return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
 
+  if (Subtarget->hasSSE41())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
+                                                  Subtarget, DAG))
+      return Blend;
+
   // Try to use byte rotation instructions.
   if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
           DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))

@@ -302,8 +302,7 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
 define <4 x i32> @PR19721(<4 x i32> %i) {
 ; CHECK-LABEL: PR19721:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; CHECK-NEXT: andps LCPI19_0, %xmm0
 ; CHECK-NEXT: retl
   %bc = bitcast <4 x i32> %i to i128
   %insert = and i128 %bc, -4294967296
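
For the PR19721 change just above: the i128 AND clears the low 32 bits, i.e. lane 0 of the <4 x i32> view, which is exactly the blend-with-zero shape, hence the single andps against a constant-pool mask. A small C++ model of the low half (a hedged sketch; the constants are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Low 64 bits of the i128 viewed as two i32 lanes (little endian).
      uint64_t Lo = 0x2222222211111111ull;          // lanes 1 and 0
      uint64_t Masked = Lo & 0xFFFFFFFF00000000ull; // and i128 %bc, -4294967296
      assert(Masked == 0x2222222200000000ull);      // lane 0 cleared, lane 1 kept
    }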

@@ -1185,17 +1185,12 @@ define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
 ; SSE-LABEL: shuffle_v4i32_0z23:
 ; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_0z23:
 ; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3]
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
   ret <4 x i32> %shuffle
@@ -1204,16 +1199,12 @@ define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
 ; SSE-LABEL: shuffle_v4i32_01z3:
 ; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_01z3:
 ; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
   ret <4 x i32> %shuffle
@@ -1222,23 +1213,17 @@ define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
 ; SSE2-LABEL: shuffle_v4i32_012z:
 ; SSE2: # BB#0:
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSE3-LABEL: shuffle_v4i32_012z:
 ; SSE3: # BB#0:
-; SSE3-NEXT: xorps %xmm1, %xmm1
-; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
-; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE3-NEXT: retq
 ;
 ; SSSE3-LABEL: shuffle_v4i32_012z:
 ; SSSE3: # BB#0:
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: shuffle_v4i32_012z:
@@ -1265,16 +1250,12 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
 define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
 ; SSE-LABEL: shuffle_v4i32_0zz3:
 ; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0]
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; SSE-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: shuffle_v4i32_0zz3:
 ; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0]
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
+; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
   ret <4 x i32> %shuffle

@@ -352,17 +352,13 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE2-LABEL: combine_bitwise_ops_test3b:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: xorps %xmm1, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test3b:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: xorps %xmm1, %xmm0
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test3b:
@@ -475,19 +471,13 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; SSE2-LABEL: combine_bitwise_ops_test6b:
 ; SSE2: # BB#0:
 ; SSE2-NEXT: xorps %xmm1, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: combine_bitwise_ops_test6b:
 ; SSSE3: # BB#0:
 ; SSSE3-NEXT: xorps %xmm1, %xmm0
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
-; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: combine_bitwise_ops_test6b: