
[X86][AVX] lowerShuffleWithVTRUNC - extend to support v16i16/v32i8 binary shuffles.

This requires a few additional SrcVT vs DstVT padding cases in getAVX512TruncNode.
Simon Pilgrim, 2020-08-18 15:24:28 +01:00
commit 5b3aace9c9, parent b8e421a05f
4 changed files with 79 additions and 279 deletions
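As an illustration, here is a minimal sketch (a hypothetical function, not one of this commit's tests) of the kind of two-operand shuffle the extended path targets: selecting the even bytes of two <32 x i8> values is the same as truncating their 512-bit concatenation from <32 x i16> to <32 x i8>, so on AVX512BW targets lowerShuffleAsVTRUNC can now turn it into a single wide truncation (VPMOVWB) instead of a longer shuffle sequence.

define <32 x i8> @even_bytes_of_pair(<32 x i8> %a, <32 x i8> %b) {
  ; The even byte of each 16-bit lane of concat(%a, %b), i.e. a <32 x i16> -> <32 x i8> truncation.
  %s = shufflevector <32 x i8> %a, <32 x i8> %b,
       <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                   i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30,
                   i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46,
                   i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  ret <32 x i8> %s
}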


@@ -11292,19 +11292,28 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, bool ZeroUppers) {
MVT SrcVT = Src.getSimpleValueType();
MVT DstSVT = DstVT.getScalarType();
unsigned NumDstElts = DstVT.getVectorNumElements();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
// Perform a direct ISD::TRUNCATE if possible.
if (NumSrcElts == NumDstElts)
return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
if (NumSrcElts > NumDstElts) {
MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
}
if ((NumSrcElts * DstEltSizeInBits) >= 128) {
MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
DstVT.getSizeInBits());
}
// Non-VLX targets must truncate from a 512-bit type, so we need to
// widen, truncate and then possibly extract the original subvector.
if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
@@ -11312,9 +11321,13 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
}
// Fallback to a X86ISD::VTRUNC.
// TODO: Handle cases where we go from 512-bit vectors to sub-128-bit vectors.
return DAG.getNode(X86ISD::VTRUNC, DL, DstVT, Src);
// Fallback to a X86ISD::VTRUNC, padding if necessary.
MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
if (DstVT != TruncVT)
Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
DstVT.getSizeInBits());
return Trunc;
}
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
@@ -11413,7 +11426,8 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unexpected VTRUNC type");
if (!Subtarget.hasAVX512())
return SDValue();
@@ -16893,6 +16907,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
// Try to lower using a truncation.
if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -17003,6 +17022,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget))
return V;
// Try to lower using a truncation.
if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
Subtarget, DAG))
return V;
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
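The two hooks above route 256-bit binary shuffles into lowerShuffleAsVTRUNC, while the getAVX512TruncNode changes earlier supply the extra SrcVT vs DstVT padding: when the truncated value is narrower than the destination it is widened with zero or undef upper elements, and when it would be narrower than 128 bits the X86ISD::VTRUNC is first formed at 128 bits. A hedged v16i16 counterpart to the sketch above (again a hypothetical function, not a test from this commit): taking the even 16-bit elements of two <16 x i16> values is a <16 x i32> to <16 x i16> truncation of their concatenation, which AVX512 can express as a single VPMOVDW-style truncation.

define <16 x i16> @even_words_of_pair(<16 x i16> %a, <16 x i16> %b) {
  ; The low half of each 32-bit lane of concat(%a, %b), i.e. a <16 x i32> -> <16 x i16> truncation.
  %s = shufflevector <16 x i16> %a, <16 x i16> %b,
       <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
                   i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  ret <16 x i16> %s
}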


@@ -176,89 +176,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
}
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT: vpmovdb %ymm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpmovdb %ymm1, %xmm1
; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
store <16 x i8> %strided.vec, <16 x i8>* %S
@@ -280,80 +203,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
}
define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMI-NEXT: vpmovqw %zmm1, %xmm1
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <32 x i16>, <32 x i16>* %L
%strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -375,81 +230,13 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
}
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512BWVL: # %bb.0:
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
; AVX512VBMI-NEXT: vpmovqb %zmm1, %xmm1
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
; AVX512VBMIVL: # %bb.0:
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
; AVX512VBMIVL-NEXT: vzeroupper
; AVX512VBMIVL-NEXT: retq
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovqb %zmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, (%rsi)
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%vec = load <64 x i8>, <64 x i8>* %L
%strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
store <8 x i8> %strided.vec, <8 x i8>* %S


@@ -4843,19 +4843,13 @@ define <32 x i8> @shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
; AVX512VLBW-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
; AVX512VLBW: # %bb.0:
; AVX512VLBW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VLBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VLBW-NEXT: retq
;
; AVX512VLVBMI-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
; AVX512VLVBMI: # %bb.0:
; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
; AVX512VLVBMI-NEXT: retq
; AVX512VL-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512VL-NEXT: vpsrlw $8, %zmm0, %zmm0
; AVX512VL-NEXT: vpmovwb %zmm0, %ymm0
; AVX512VL-NEXT: retq
;
; XOPAVX1-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
; XOPAVX1: # %bb.0:


@@ -576,46 +576,41 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
;
; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX512-NEXT: vmovdqa (%rdi), %ymm5
; AVX512-NEXT: vpmovdb %zmm5, %xmm5
; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm8
; AVX512-NEXT: vmovdqa (%rdi), %xmm1
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm4
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm5
; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6
; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm7
; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm7
; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm6
; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm7
; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm4
; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm7
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm0
; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0
; AVX512-NEXT: vpcmpeqb %zmm0, %zmm4, %k1
; AVX512-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
; AVX512-NEXT: kxnorw %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0