[X86] Add v32i8 shuffle lowering strategy to recognize two v4i64 vectors truncated to v4i8 and concatenated into the lower 8 bytes with undef/zero upper bytes.
This patch recognizes the shuffle pattern we get from a v8i64->v8i8 truncate when v8i64 isn't a legal type. With VLX we can use two VTRUNCs, an unpckldq, and an insert_subvector.

Differential Revision: https://reviews.llvm.org/D68374

llvm-svn: 373645
parent 63c70efa3f
commit 3d66f2b205
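As a reading aid (not from the commit itself): on little-endian x86, truncating eight i64 lanes to i8 keeps byte 0 of each 8-byte lane, so a v8i64->v8i8 truncate over two concatenated v4i64 vectors is exactly the v32i8 shuffle {0, 8, 16, 24, 32, 40, 48, 56} with the remaining 24 bytes zeroable. A minimal standalone C++ sketch of that identity (names and values are illustrative, not LLVM code):

// Models the byte-level effect of the v8i64->v8i8 truncate that the new
// lowering recognizes.
#include <array>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  std::array<uint64_t, 4> V1 = {0x1101, 0x2202, 0x3303, 0x4404};
  std::array<uint64_t, 4> V2 = {0x5505, 0x6606, 0x7707, 0x8808};

  // View the two 256-bit inputs as one concatenated run of 64 bytes.
  uint8_t Bytes[64];
  std::memcpy(Bytes, V1.data(), 32);
  std::memcpy(Bytes + 32, V2.data(), 32);

  // Apply the shuffle mask the patch matches: every 8th byte, then zeros.
  uint8_t Result[32] = {};
  for (int i = 0; i != 8; ++i)
    Result[i] = Bytes[i * 8];

  for (int i = 0; i != 8; ++i)
    std::printf("%02x ", Result[i]); // prints: 01 02 03 04 05 06 07 08
  std::printf("\n");
}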
lib/Target/X86/X86ISelLowering.cpp
@@ -15520,6 +15520,42 @@ static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
                      DAG.getTargetConstant(Immediate, DL, MVT::i8));
 }
 
+// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
+// by zeroable elements in the remaining 24 elements. Turn this into two
+// vmovqb instructions shuffled together.
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             ArrayRef<int> Mask,
+                                             const APInt &Zeroable,
+                                             SelectionDAG &DAG) {
+  assert(VT == MVT::v32i8 && "Unexpected type!");
+
+  // The first 8 indices should be every 8th element.
+  if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
+    return SDValue();
+
+  // Remaining elements need to be zeroable.
+  if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
+    return SDValue();
+
+  V1 = DAG.getBitcast(MVT::v4i64, V1);
+  V2 = DAG.getBitcast(MVT::v4i64, V2);
+
+  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
+  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
+
+  // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
+  // the upper bits of the result using an unpckldq.
+  SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
+                                        { 0, 1, 2, 3, 16, 17, 18, 19,
+                                          4, 5, 6, 7, 20, 21, 22, 23 });
+
+  // Insert the unpckldq into a zero vector to widen to v32i8.
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
+                     DAG.getConstant(0, DL, MVT::v32i8), Unpack,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
+
 /// Handle lowering of 4-lane 64-bit floating point shuffles.
 ///
 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
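As a reading aid (not from the commit itself), a scalar model of what the two X86ISD::VTRUNC nodes and the fixed {0,1,2,3,16,17,18,19,4,5,6,7,20,21,22,23} shuffle above compute. The helper names vtrunc and unpckldq are hypothetical: each VTRUNC (vpmovqb) keeps the four truncated bytes in the low 4 bytes and zeroes the upper 12, and the unpack interleaves the two 4-byte payloads while dragging the zeros into the top half:

// Standalone sketch, not LLVM code.
#include <array>
#include <cstdint>

using V16 = std::array<uint8_t, 16>;

// Models vpmovqb ymm -> xmm: truncate four i64 lanes to i8, zero the rest.
static V16 vtrunc(const std::array<uint64_t, 4> &V) {
  V16 R{};                                  // upper 12 bytes stay zero
  for (int i = 0; i != 4; ++i)
    R[i] = static_cast<uint8_t>(V[i]);
  return R;
}

// Models the v16i8 shuffle used above; indices >= 16 select from B.
static V16 unpckldq(const V16 &A, const V16 &B) {
  constexpr int Mask[16] = {0, 1, 2, 3, 16, 17, 18, 19,
                            4, 5, 6, 7, 20, 21, 22, 23};
  V16 R{};
  for (int i = 0; i != 16; ++i)
    R[i] = Mask[i] < 16 ? A[Mask[i]] : B[Mask[i] - 16];
  return R;
}

int main() {
  V16 Lo = vtrunc({1, 2, 3, 4});
  V16 Hi = vtrunc({5, 6, 7, 8});
  V16 R = unpckldq(Lo, Hi);     // bytes 0..7 = 1..8, bytes 8..15 = 0
  return R[7] == 8 && R[8] == 0 ? 0 : 1;
}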
@@ -16120,6 +16156,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
     return V;
 
+  // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
+  // by zeroable elements in the remaining 24 elements. Turn this into two
+  // vmovqb instructions shuffled together.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
+                                                  Mask, Zeroable, DAG))
+      return V;
+
   // Otherwise fall back on generic lowering.
   return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
                                     Subtarget, DAG);
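As a reading aid (not from the commit itself): the call is gated on Subtarget.hasVLX() because vpmovqb with a 256-bit source requires AVX512VL. Below is a hedged scalar model of the two early-out guards in lowerShuffleAsVTRUNCAndUnpack, assuming -1 marks an undef mask element and modeling the APInt Zeroable as one bool per element (helper names are illustrative):

// Standalone sketch, not LLVM code.
#include <vector>

// Mirrors isSequentialOrUndefInRange(Mask, 0, 8, 0, 8): the first 8
// entries must be 0, 8, 16, ..., 56, with undef (-1) allowed anywhere.
static bool firstEightAreEveryEighth(const std::vector<int> &Mask) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] != -1 && Mask[i] != i * 8)
      return false;
  return true;
}

// Mirrors Zeroable.countLeadingOnes() >= Mask.size() - 8: every element
// past the first 8 must be known zero for the lowering to fire.
static bool tailIsZeroable(const std::vector<bool> &Zeroable) {
  for (std::size_t i = 8; i != Zeroable.size(); ++i)
    if (!Zeroable[i])
      return false;
  return true;
}

int main() {
  std::vector<int> Mask = {0, 8, 16, 24, -1, 40, 48, 56}; // undef allowed
  std::vector<bool> Zeroable(32, false);
  for (int i = 8; i != 32; ++i)
    Zeroable[i] = true;                                   // tail is zero
  return firstEightAreEveryEighth(Mask) && tailIsZeroable(Zeroable) ? 0 : 1;
}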
test/CodeGen/X86/min-legal-vector-width.ll
@@ -831,19 +831,12 @@ define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-
 define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
 ; CHECK-AVX512-LABEL: trunc_v8i64_v8i8:
 ; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; CHECK-AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
-; CHECK-AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
-; CHECK-AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
-; CHECK-AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; CHECK-AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; CHECK-AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; CHECK-AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; CHECK-AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; CHECK-AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; CHECK-AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; CHECK-AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; CHECK-AVX512-NEXT:    vmovdqa (%rdi), %ymm0
+; CHECK-AVX512-NEXT:    vmovdqa 32(%rdi), %ymm1
+; CHECK-AVX512-NEXT:    vpmovqb %ymm1, %xmm1
+; CHECK-AVX512-NEXT:    vpmovqb %ymm0, %xmm0
+; CHECK-AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-AVX512-NEXT:    vzeroupper
 ; CHECK-AVX512-NEXT:    retq
 ;
 ; CHECK-VBMI-LABEL: trunc_v8i64_v8i8:
test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -549,20 +549,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512VL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512VL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512VL-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512VL-NEXT:    vpmovqb %ymm1, %xmm1
+; AVX512VL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
@@ -585,20 +578,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512BWVL:       # %bb.0:
-; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512BWVL-NEXT:    vmovdqa 16(%rdi), %xmm1
-; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT:    vmovdqa 48(%rdi), %xmm3
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
-; AVX512BWVL-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
-; AVX512BWVL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512BWVL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512BWVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
+; AVX512BWVL-NEXT:    vmovdqa 32(%rdi), %ymm1
+; AVX512BWVL-NEXT:    vpmovqb %ymm1, %xmm1
+; AVX512BWVL-NEXT:    vpmovqb %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BWVL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: