
[x86] try harder to form 256-bit unpck*

This addresses another part of the problem noted in PR42024:
https://bugs.llvm.org/show_bug.cgi?id=42024

The AVX2 code may use awkward 256-bit shuffles, whereas the AVX code gets split
into the expected 128-bit unpack instructions. We have to be selective about the
types where we try this, though; otherwise, we can end up with more instructions
(as in the v8x32/v4x64 cases).

Differential Revision: https://reviews.llvm.org/D72575
Sanjay Patel 2020-01-17 10:20:25 -05:00
parent 0cea4f6d7d
commit 2a9f2d098c
4 changed files with 93 additions and 27 deletions
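
The transform at the heart of the patch: a unary "splat2" shuffle such as
<0,0,1,1,2,2,3,3> can be formed by permuting the 64-bit chunks of the source as
[0,2,1,3] (vpermq/vpermpd) and then applying the in-lane 256-bit unpack
instructions. The standalone C++ sketch below is illustrative only and not part
of the patch (the names V8, permuteChunks, and unpackUnary are made up); it
models the v8i32 case and checks that this composition reproduces the splat2
masks that createSplat2ShuffleMask builds further down.

// Illustrative model of the v8i32 case: permute the four 64-bit chunks as
// [0,2,1,3], then apply the unary 256-bit unpckl/unpckh semantics (each
// 128-bit lane interleaves its own low or high half with itself), and check
// that the composition equals the splat2 masks.
#include <array>
#include <cassert>
#include <cstdio>

using V8 = std::array<int, 8>;

// vpermpd/vpermq-style permute of 64-bit chunks (two i32 elements per chunk).
static V8 permuteChunks(const V8 &Src, const std::array<int, 4> &ChunkMask) {
  V8 Out{};
  for (int C = 0; C < 4; ++C) {
    Out[2 * C] = Src[2 * ChunkMask[C]];
    Out[2 * C + 1] = Src[2 * ChunkMask[C] + 1];
  }
  return Out;
}

// Unary in-lane unpack: per 128-bit lane (4 x i32), duplicate the low (Lo) or
// high (!Lo) pair of elements, as vpunpckldq/vpunpckhdq do when V1 == V2.
static V8 unpackUnary(const V8 &Src, bool Lo) {
  V8 Out{};
  for (int Lane = 0; Lane < 2; ++Lane) {
    int Base = Lane * 4 + (Lo ? 0 : 2);
    Out[Lane * 4 + 0] = Src[Base];
    Out[Lane * 4 + 1] = Src[Base];
    Out[Lane * 4 + 2] = Src[Base + 1];
    Out[Lane * 4 + 3] = Src[Base + 1];
  }
  return Out;
}

int main() {
  V8 Src = {0, 1, 2, 3, 4, 5, 6, 7};          // element indices of the input
  V8 Perm = permuteChunks(Src, {0, 2, 1, 3}); // the fixed cross-lane permute
  assert((unpackUnary(Perm, /*Lo=*/true) == V8{0, 0, 1, 1, 2, 2, 3, 3}));
  assert((unpackUnary(Perm, /*Lo=*/false) == V8{4, 4, 5, 5, 6, 6, 7, 7}));
  std::puts("permute [0,2,1,3] + in-lane unpack == splat2 masks");
  return 0;
}

The same composition works for v16i16 and v32i8; only the number of elements
per 64-bit chunk changes, which is why the patch hooks the new lowering into
the i32, i16, and i8 shuffle lowerings below.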


@@ -10929,6 +10929,32 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
  return SDValue();
}

/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
/// followed by unpack 256-bit.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask, SDValue V1,
                                        SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 32> Unpckl, Unpckh;
  createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
  createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);

  unsigned UnpackOpcode;
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    UnpackOpcode = X86ISD::UNPCKL;
  else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    UnpackOpcode = X86ISD::UNPCKH;
  else
    return SDValue();

  // This is a "natural" unpack operation (rather than the 128-bit sectored
  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
  // input in order to use the x86 instruction.
  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
                            DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
  V1 = DAG.getBitcast(VT, V1);
  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
}

static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                int Delta) {
  int Size = (int)Mask.size();
@@ -16210,9 +16236,14 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
          DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
    return V;

  // If the shuffle patterns aren't repeated but it is a single input, directly
  // generate a cross-lane VPERMD instruction.
  if (V2.isUndef()) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
      return V;

    // If the shuffle patterns aren't repeated but it's a single input, directly
    // generate a cross-lane VPERMD instruction.
    SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
    return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
  }
@@ -16294,6 +16325,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
    return V;

  if (V2.isUndef()) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
      return V;

    // There are no generalized cross-lane shuffle operations available on i16
    // element types.
    if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
@@ -16396,6 +16432,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
  // There are no generalized cross-lane shuffle operations available on i8
  // element types.
  if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
    // Try to produce a fixed cross-128-bit lane permute followed by unpack
    // because that should be faster than the variable permute alternatives.
    if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
      return V;

    if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
            DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
      return V;


@@ -1688,6 +1688,21 @@ namespace llvm {
    }
  }

  /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
  /// imposed by AVX and specific to the unary pattern. Example:
  /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
  /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
  template <typename T = int>
  void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo) {
    assert(Mask.empty() && "Expected an empty shuffle mask vector");
    int NumElts = VT.getVectorNumElements();
    for (int i = 0; i < NumElts; ++i) {
      int Pos = i / 2;
      Pos += (Lo ? 0 : NumElts / 2);
      Mask.push_back(Pos);
    }
  }

  /// Helper function to scale a shuffle or target shuffle mask, replacing each
  /// mask index with the scaled sequential indices for an equivalent narrowed
  /// mask. This is the reverse process to canWidenShuffleElements, but can
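
As a quick illustration (separate from the patch) of what the new helper
builds, the short sketch below computes the Lo and Hi masks with the same
formula for a 16-element type such as v16i16, complementing the v8 example in
the doc comment above:

// Standalone illustration of the splat2 masks for NumElts = 16 (e.g. v16i16).
#include <cstdio>
#include <vector>

static std::vector<int> splat2Mask(int NumElts, bool Lo) {
  std::vector<int> Mask;
  for (int I = 0; I < NumElts; ++I)
    Mask.push_back(I / 2 + (Lo ? 0 : NumElts / 2));
  return Mask;
}

int main() {
  // Lo: 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7
  // Hi: 8 8 9 9 10 10 11 11 12 12 13 13 14 14 15 15
  for (bool Lo : {true, false}) {
    for (int Idx : splat2Mask(16, Lo))
      std::printf("%d ", Idx);
    std::printf("\n");
  }
  return 0;
}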


@@ -171,12 +171,9 @@ define void @splat2_i8(<32 x i8>* %s, <64 x i8>* %d) {
;
; AVX2-LABEL: splat2_i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
@@ -221,12 +218,9 @@ define void @splat2_i16(<16 x i16>* %s, <32 x i16>* %d) {
;
; AVX2-LABEL: splat2_i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqu (%rdi), %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,2,3,2,3,4,5,4,5,6,7,6,7,8,9,8,9,10,11,10,11,12,13,12,13,14,15,14,15]
; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
@@ -269,11 +263,9 @@ define void @splat2_i32(<8 x i32>* %s, <16 x i32>* %d) {
;
; AVX2-LABEL: splat2_i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm1
; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [4,4,5,5,6,6,7,7]
; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpermilps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
; AVX2-NEXT: vmovups %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
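
The new AVX2 sequences above map directly onto intrinsics: one cross-lane
64-bit permute followed by in-lane unpacks. A rough sketch for the byte case,
mirroring the splat2_i8 output (the function and parameter names are
illustrative, not taken from the test):

// Rough intrinsics equivalent of the new splat2_i8 codegen: one cross-lane
// 64-bit permute (vpermq imm 0xD8 == [0,2,1,3]) followed by in-lane byte
// unpacks that duplicate each byte.
#include <immintrin.h>

void splat2_bytes(const unsigned char *Src, unsigned char *Dst) {
  __m256i V = _mm256_loadu_si256((const __m256i *)Src);
  // Rearrange 64-bit chunks so each 128-bit lane holds 16 consecutive source
  // bytes in the order the unpacks expect.
  __m256i P = _mm256_permute4x64_epi64(V, 0xD8); // [0,2,1,3]
  __m256i Lo = _mm256_unpacklo_epi8(P, P);       // bytes 0..15 duplicated
  __m256i Hi = _mm256_unpackhi_epi8(P, P);       // bytes 16..31 duplicated
  _mm256_storeu_si256((__m256i *)Dst, Lo);
  _mm256_storeu_si256((__m256i *)(Dst + 32), Hi);
}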


@@ -1517,11 +1517,29 @@ define <8 x i32> @shuffle_v8i32_00112233(<8 x i32> %a, <8 x i32> %b) {
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2OR512VL-LABEL: shuffle_v8i32_00112233:
; AVX2OR512VL: # %bb.0:
; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2OR512VL-NEXT: retq
; AVX2-SLOW-LABEL: shuffle_v8i32_00112233:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: shuffle_v8i32_00112233:
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v8i32_00112233:
; AVX512VL-SLOW: # %bb.0:
; AVX512VL-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-SLOW-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX512VL-SLOW-NEXT: retq
;
; AVX512VL-FAST-LABEL: shuffle_v8i32_00112233:
; AVX512VL-FAST: # %bb.0:
; AVX512VL-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3]
; AVX512VL-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0
; AVX512VL-FAST-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
ret <8 x i32> %shuffle
}
@@ -1689,10 +1707,10 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
;
; AVX2-SLOW-LABEL: shuffle_v8i32_08991abb:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,1,3]
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-SLOW-NEXT: retq
;
@@ -1700,7 +1718,7 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) {
; AVX2-FAST: # %bb.0:
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,1,u,1,u,u,u>
; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <u,0,1,1,u,2,3,3>
; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,1,1,2,2,3,3]
; AVX2-FAST-NEXT: vpermps %ymm1, %ymm2, %ymm1
; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7]
; AVX2-FAST-NEXT: retq