[X86][AVX] lowerShuffleWithVTRUNC - extend to support v16i16/v32i8 binary shuffles.
This requires a few additional SrcVT vs DstVT padding cases in getAVX512TruncNode.
parent b8e421a05f
commit 5b3aace9c9
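
The new padding cases matter because lowerShuffleWithVTRUNC can now hand getAVX512TruncNode a destination type whose total width no longer matches the truncated source. As intuition only, here is a standalone C++ model of the case analysis (the VT struct and truncStrategy are hypothetical names, not LLVM code):

    // Model of the SrcVT vs DstVT cases handled by getAVX512TruncNode.
    #include <cstdio>

    struct VT { unsigned NumElts, EltBits; }; // e.g. v16i16 = {16, 16}

    const char *truncStrategy(VT Src, VT Dst) {
      if (Src.NumElts == Dst.NumElts)
        return "direct ISD::TRUNCATE";
      if (Src.NumElts > Dst.NumElts)
        return "truncate at full width, then extract the low subvector";
      if (Src.NumElts * Dst.EltBits >= 128) // padding case added here
        return "truncate, then widen with zero/undef upper elements";
      return "X86ISD::VTRUNC at 128 bits, widened to DstVT if needed";
    }

    int main() {
      VT V8I32{8, 32}, V16I16{16, 16};
      // v8i32 -> v16i16: 8 * 16 = 128 bits, so truncate v8i32 to v8i16
      // and pad the upper half (one of the new padding cases).
      std::printf("%s\n", truncStrategy(V8I32, V16I16));
    }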
@@ -11292,19 +11292,28 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG, bool ZeroUppers) {
   MVT SrcVT = Src.getSimpleValueType();
+  MVT DstSVT = DstVT.getScalarType();
   unsigned NumDstElts = DstVT.getVectorNumElements();
   unsigned NumSrcElts = SrcVT.getVectorNumElements();
+  unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits();
 
   // Perform a direct ISD::TRUNCATE if possible.
   if (NumSrcElts == NumDstElts)
     return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src);
 
   if (NumSrcElts > NumDstElts) {
-    MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts);
+    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
     return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits());
   }
 
+  if ((NumSrcElts * DstEltSizeInBits) >= 128) {
+    MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts);
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
+    return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+                          DstVT.getSizeInBits());
+  }
+
   // Non-VLX targets must truncate from a 512-bit type, so we need to
   // widen, truncate and then possibly extract the original subvector.
   if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) {
@@ -11312,9 +11321,13 @@ static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src,
     return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers);
   }
 
-  // Fallback to a X86ISD::VTRUNC.
-  // TODO: Handle cases where we go from 512-bit vectors to sub-128-bit vectors.
-  return DAG.getNode(X86ISD::VTRUNC, DL, DstVT, Src);
+  // Fallback to a X86ISD::VTRUNC, padding if necessary.
+  MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits);
+  SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src);
+  if (DstVT != TruncVT)
+    Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL,
+                           DstVT.getSizeInBits());
+  return Trunc;
 }
 
 static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
@@ -11413,7 +11426,8 @@ static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
+  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+         "Unexpected VTRUNC type");
   if (!Subtarget.hasAVX512())
     return SDValue();
 
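
The assert is relaxed because callers may now pass any 128-bit or 256-bit integer type. The underlying match is unchanged: a shuffle lowers to a truncation when each defined mask element selects every Scale-th lane of the concatenated inputs. A minimal standalone sketch of that predicate (illustrative only, not the matchShuffleAsVPMOV implementation):

    #include <cassert>
    #include <vector>

    // True if Mask reads lanes 0, Scale, 2*Scale, ... (-1 lanes are undef).
    bool looksLikeTruncation(const std::vector<int> &Mask, int Scale) {
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (Mask[i] >= 0 && Mask[i] != i * Scale)
          return false;
      return true;
    }

    int main() {
      std::vector<int> Mask;
      for (int i = 0; i != 16; ++i)
        Mask.push_back(i * 4); // <0,4,8,...,60>: truncate by a factor of 4
      assert(looksLikeTruncation(Mask, 4));
      assert(!looksLikeTruncation(Mask, 2));
    }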
@@ -16893,6 +16907,11 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  Subtarget))
     return V;
 
+  // Try to use lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
@@ -17003,6 +17022,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 Subtarget))
     return V;
 
+  // Try to use lower using a truncation.
+  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable,
+                                       Subtarget, DAG))
+    return V;
+
   // Try to use shift instructions.
   if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
                                           Zeroable, Subtarget, DAG))
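
Both new call sites accept binary shuffles: mask indices of NumElts or more select from the second operand, so the even-element mask spans V1 and V2. A hypothetical illustration of the masks these call sites can now catch (evenMask is not an LLVM helper):

    #include <cassert>
    #include <vector>

    // Even-element binary mask over V1 ++ V2: <0,2,...,2N-2>, i.e. a
    // truncate-by-2 of the 2N-element concatenation.
    std::vector<int> evenMask(int NumElts) {
      std::vector<int> Mask;
      for (int i = 0; i != NumElts; ++i)
        Mask.push_back(2 * i); // entries >= NumElts come from V2
      return Mask;
    }

    int main() {
      std::vector<int> M16 = evenMask(16); // the v16i16 call site
      std::vector<int> M32 = evenMask(32); // the v32i8 call site
      assert(M16[8] == 16 && M16.back() == 30);  // crosses into V2 at i == 8
      assert(M32[16] == 32 && M32.back() == 62); // crosses into V2 at i == 16
    }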
@@ -176,89 +176,12 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-NEXT: vpmovdb %ymm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BWVL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpmovdb %ymm1, %xmm1
-; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BWVL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v16i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
   store <16 x i8> %strided.vec, <16 x i8>* %S
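
The old multi-instruction sequences and the new single vpmovdb are interchangeable because taking every fourth byte of a little-endian <64 x i8> load is exactly truncating sixteen i32 lanes to i8. A standalone check of that equivalence (assumes little-endian layout, as on x86):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t Lanes[16];
      for (int i = 0; i != 16; ++i)
        Lanes[i] = 0xA1B2C300u + (uint32_t)i; // arbitrary test pattern
      uint8_t Bytes[64];
      std::memcpy(Bytes, Lanes, sizeof(Lanes)); // <64 x i8> view of <16 x i32>
      for (int i = 0; i != 16; ++i)
        assert((uint8_t)Lanes[i] == Bytes[4 * i]); // vpmovdb == stride-4 bytes
    }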
@@ -280,80 +203,12 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 }
 
 define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
-; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512BWVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
-; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT: vpmovqw %zmm1, %xmm1
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm0 = [0,4,8,12,16,20,24,28]
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovdqa %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v32i16_to_v8i16:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
   store <8 x i16> %strided.vec, <8 x i16>* %S
@@ -375,81 +230,13 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 }
 
 define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
-; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512F-NEXT: vmovq %xmm0, (%rsi)
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1
-; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
-;
-; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1
-; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
-; AVX512BWVL-NEXT: vzeroupper
-; AVX512BWVL-NEXT: retq
-;
-; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
-; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512VBMI-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
-; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
-; AVX512VBMI-NEXT: vzeroupper
-; AVX512VBMI-NEXT: retq
-;
-; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
-; AVX512VBMIVL: # %bb.0:
-; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
-; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
-; AVX512VBMIVL-NEXT: vpermi2b 32(%rdi), %ymm0, %ymm1
-; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
-; AVX512VBMIVL-NEXT: vzeroupper
-; AVX512VBMIVL-NEXT: retq
+; AVX512-LABEL: shuffle_v64i8_to_v8i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovqb %zmm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, (%rsi)
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
   store <8 x i8> %strided.vec, <8 x i8>* %S
@@ -4843,19 +4843,13 @@ define <32 x i8> @shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62(
 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT: retq
 ;
-; AVX512VLBW-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX512VLBW: # %bb.0:
-; AVX512VLBW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512VLBW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpsrlw $8, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpmovwb %zmm0, %ymm0
-; AVX512VLBW-NEXT: retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
-; AVX512VLVBMI: # %bb.0:
-; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
-; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0
-; AVX512VLVBMI-NEXT: retq
+; AVX512VL-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsrlw $8, %zmm0, %zmm0
+; AVX512VL-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512VL-NEXT: retq
 ;
 ; XOPAVX1-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62:
 ; XOPAVX1: # %bb.0:
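
The surviving codegen here extracts the odd bytes by shifting every 16-bit word right by 8 and then truncating words to bytes (vpsrlw followed by vpmovwb): on little-endian data the high byte of word i is byte 2*i+1. A standalone check of that identity:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint16_t Words[32];
      for (int i = 0; i != 32; ++i)
        Words[i] = (uint16_t)(0x1234 + 257 * i); // arbitrary test pattern
      uint8_t Bytes[64];
      std::memcpy(Bytes, Words, sizeof(Words));
      for (int i = 0; i != 32; ++i)
        assert((uint8_t)(Words[i] >> 8) == Bytes[2 * i + 1]);
    }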
@@ -576,46 +576,41 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
 ;
 ; AVX512-LABEL: interleaved_load_vf16_i8_stride4:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
-; AVX512-NEXT: vmovdqa (%rdi), %ymm5
-; AVX512-NEXT: vpmovdb %zmm5, %xmm5
-; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
+; AVX512-NEXT: vpmovdb %zmm0, %xmm8
+; AVX512-NEXT: vmovdqa (%rdi), %xmm1
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
+; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
+; AVX512-NEXT: vmovdqa 48(%rdi), %xmm4
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
-; AVX512-NEXT: vpshufb %xmm5, %xmm2, %xmm5
+; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm6
+; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm5
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm7
-; AVX512-NEXT: vpshufb %xmm6, %xmm0, %xmm6
+; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm6
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm7
-; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm6
+; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm7
+; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm6
 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm7 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm4
-; AVX512-NEXT: vpshufb %xmm7, %xmm0, %xmm7
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
+; AVX512-NEXT: vpshufb %xmm7, %xmm2, %xmm0
+; AVX512-NEXT: vpshufb %xmm7, %xmm1, %xmm7
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
 ; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm4
 ; AVX512-NEXT: vpshufb %xmm6, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm6, %xmm2, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
 ; AVX512-NEXT: vpcmpeqb %zmm5, %zmm8, %k0
-; AVX512-NEXT: vpcmpeqb %zmm0, %zmm4, %k1
+; AVX512-NEXT: vpcmpeqb %zmm1, %zmm0, %k1
 ; AVX512-NEXT: kxnorw %k1, %k0, %k0
 ; AVX512-NEXT: vpmovm2b %k0, %zmm0
 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0