mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
[x86] Start to introduce bit-masking based blend lowering.
This is the simplest form of bit-math based blending which only fires when we are blending with zero and is relatively profitable. I've only enabled this path on very specific lowering strategies. I'm planning to widen its applicability in subsequent patches, but so far you'll notice that even though we get fewer shufps instructions, we *still* do the bit math in the FP execution port. I'm looking into why this is still happening. llvm-svn: 228124
This commit is contained in:
parent
62dda41067
commit
61ac2c112b
@ -7733,6 +7733,46 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
|
||||
return Zeroable;
|
||||
}
|
||||
|
||||
/// \brief Try to emit a bitmask instruction for a shuffle.
|
||||
///
|
||||
/// This handles cases where we can model a blend exactly as a bitmask due to
|
||||
/// one of the inputs being zeroable.
|
||||
static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
|
||||
SDValue V2, ArrayRef<int> Mask,
|
||||
SelectionDAG &DAG) {
|
||||
MVT EltVT = VT.getScalarType();
|
||||
int NumEltBits = EltVT.getSizeInBits();
|
||||
MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
|
||||
SDValue Zero = DAG.getConstant(0, IntEltVT);
|
||||
SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
|
||||
if (EltVT.isFloatingPoint()) {
|
||||
Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
|
||||
AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
|
||||
}
|
||||
SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
|
||||
SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
|
||||
SDValue V;
|
||||
for (int i = 0, Size = Mask.size(); i < Size; ++i) {
|
||||
if (Zeroable[i])
|
||||
continue;
|
||||
if (Mask[i] % Size != i)
|
||||
return SDValue(); // Not a blend.
|
||||
if (!V)
|
||||
V = Mask[i] < Size ? V1 : V2;
|
||||
else if (V != (Mask[i] < Size ? V1 : V2))
|
||||
return SDValue(); // Can only let one input through the mask.
|
||||
|
||||
VMaskOps[i] = AllOnes;
|
||||
}
|
||||
if (!V)
|
||||
return SDValue(); // No non-zeroable elements!
|
||||
|
||||
SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
|
||||
V = DAG.getNode(VT.isFloatingPoint() ? X86ISD::FAND : ISD::AND, DL, VT, V,
|
||||
VMask);
|
||||
return V;
|
||||
}
|
||||
|
||||
/// \brief Try to lower a vector shuffle as a byte shift (shifts in zeros).
|
||||
///
|
||||
/// Attempts to match a shuffle mask against the PSRLDQ and PSLLDQ SSE2
|
||||
@ -8743,17 +8783,21 @@ static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
Mask, Subtarget, DAG))
|
||||
return V;
|
||||
|
||||
if (Subtarget->hasSSE41())
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
|
||||
Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
if (SDValue Masked =
|
||||
lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, DAG))
|
||||
return Masked;
|
||||
|
||||
// Use dedicated unpack instructions for masks that match their pattern.
|
||||
if (isShuffleEquivalent(Mask, 0, 4, 1, 5))
|
||||
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v4i32, V1, V2);
|
||||
if (isShuffleEquivalent(Mask, 2, 6, 3, 7))
|
||||
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v4i32, V1, V2);
|
||||
|
||||
if (Subtarget->hasSSE41())
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
|
||||
Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
// Try to use byte rotation instructions.
|
||||
// It's more profitable for pre-SSSE3 to use shuffles/unpacks.
|
||||
if (Subtarget->hasSSSE3())
|
||||
@ -9455,17 +9499,21 @@ static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
|
||||
Mask, Subtarget, DAG))
|
||||
return V;
|
||||
|
||||
if (Subtarget->hasSSE41())
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
|
||||
Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
if (SDValue Masked =
|
||||
lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, DAG))
|
||||
return Masked;
|
||||
|
||||
// Use dedicated unpack instructions for masks that match their pattern.
|
||||
if (isShuffleEquivalent(Mask, 0, 8, 1, 9, 2, 10, 3, 11))
|
||||
return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, V1, V2);
|
||||
if (isShuffleEquivalent(Mask, 4, 12, 5, 13, 6, 14, 7, 15))
|
||||
return DAG.getNode(X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2);
|
||||
|
||||
if (Subtarget->hasSSE41())
|
||||
if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
|
||||
Subtarget, DAG))
|
||||
return Blend;
|
||||
|
||||
// Try to use byte rotation instructions.
|
||||
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
|
||||
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
|
||||
|
@ -302,8 +302,7 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
|
||||
define <4 x i32> @PR19721(<4 x i32> %i) {
|
||||
; CHECK-LABEL: PR19721:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: xorps %xmm1, %xmm1
|
||||
; CHECK-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
||||
; CHECK-NEXT: andps LCPI19_0, %xmm0
|
||||
; CHECK-NEXT: retl
|
||||
%bc = bitcast <4 x i32> %i to i128
|
||||
%insert = and i128 %bc, -4294967296
|
||||
|
@ -1185,17 +1185,12 @@ define <4 x i32> @shuffle_v4i32_01zu(<4 x i32> %a) {
|
||||
define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
|
||||
; SSE-LABEL: shuffle_v4i32_0z23:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: xorps %xmm1, %xmm1
|
||||
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
|
||||
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
|
||||
; SSE-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v4i32_0z23:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
|
||||
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[2,0],xmm0[2,3]
|
||||
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
|
||||
ret <4 x i32> %shuffle
|
||||
@ -1204,16 +1199,12 @@ define <4 x i32> @shuffle_v4i32_0z23(<4 x i32> %a) {
|
||||
define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
|
||||
; SSE-LABEL: shuffle_v4i32_01z3:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: xorps %xmm1, %xmm1
|
||||
; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
|
||||
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
|
||||
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v4i32_01z3:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
|
||||
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
|
||||
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
|
||||
ret <4 x i32> %shuffle
|
||||
@ -1222,23 +1213,17 @@ define <4 x i32> @shuffle_v4i32_01z3(<4 x i32> %a) {
|
||||
define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
|
||||
; SSE2-LABEL: shuffle_v4i32_012z:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: xorps %xmm1, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
|
||||
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE3-LABEL: shuffle_v4i32_012z:
|
||||
; SSE3: # BB#0:
|
||||
; SSE3-NEXT: xorps %xmm1, %xmm1
|
||||
; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
|
||||
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
|
||||
; SSE3-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSE3-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: shuffle_v4i32_012z:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: xorps %xmm1, %xmm1
|
||||
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,0],xmm0[2,0]
|
||||
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
|
||||
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: shuffle_v4i32_012z:
|
||||
@ -1265,16 +1250,12 @@ define <4 x i32> @shuffle_v4i32_012z(<4 x i32> %a) {
|
||||
define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
|
||||
; SSE-LABEL: shuffle_v4i32_0zz3:
|
||||
; SSE: # BB#0:
|
||||
; SSE-NEXT: xorps %xmm1, %xmm1
|
||||
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0]
|
||||
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
|
||||
; SSE-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSE-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: shuffle_v4i32_0zz3:
|
||||
; AVX: # BB#0:
|
||||
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
|
||||
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],xmm1[0,0]
|
||||
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,3,1]
|
||||
; AVX-NEXT: vandps {{.*}}(%rip), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%shuffle = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 4, i32 3>
|
||||
ret <4 x i32> %shuffle
|
||||
|
@ -352,17 +352,13 @@ define <4 x i32> @combine_bitwise_ops_test3b(<4 x i32> %a, <4 x i32> %b, <4 x i3
|
||||
; SSE2-LABEL: combine_bitwise_ops_test3b:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: xorps %xmm1, %xmm0
|
||||
; SSE2-NEXT: xorps %xmm1, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
||||
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: combine_bitwise_ops_test3b:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: xorps %xmm1, %xmm0
|
||||
; SSSE3-NEXT: xorps %xmm1, %xmm1
|
||||
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[1,3]
|
||||
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
|
||||
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: combine_bitwise_ops_test3b:
|
||||
@ -475,19 +471,13 @@ define <4 x i32> @combine_bitwise_ops_test6b(<4 x i32> %a, <4 x i32> %b, <4 x i3
|
||||
; SSE2-LABEL: combine_bitwise_ops_test6b:
|
||||
; SSE2: # BB#0:
|
||||
; SSE2-NEXT: xorps %xmm1, %xmm0
|
||||
; SSE2-NEXT: xorps %xmm1, %xmm1
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
|
||||
; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
|
||||
; SSE2-NEXT: movaps %xmm1, %xmm0
|
||||
; SSE2-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSSE3-LABEL: combine_bitwise_ops_test6b:
|
||||
; SSSE3: # BB#0:
|
||||
; SSSE3-NEXT: xorps %xmm1, %xmm0
|
||||
; SSSE3-NEXT: xorps %xmm1, %xmm1
|
||||
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
|
||||
; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
|
||||
; SSSE3-NEXT: movaps %xmm1, %xmm0
|
||||
; SSSE3-NEXT: andps {{.*}}(%rip), %xmm0
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; SSE41-LABEL: combine_bitwise_ops_test6b:
|
||||
|
Loading…
x
Reference in New Issue
Block a user