Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2025-01-31 12:41:49 +01:00)
[X86][AVX] Lower v16i8/v8i16 binary shuffles using VTRUNC/TRUNCATE

This patch adds lowerShuffleAsVTRUNC to handle basic binary shuffles that can be lowered either as a pure ISD::TRUNCATE or as an X86ISD::VTRUNC (with undef/zero values in the remaining upper elements). We concat the binary sources together into a single 256-bit source vector.

To avoid regressions, we perform this after we've tried to lower with PACKSS/PACKUS, which typically does a cleaner job than a concat.

For non-AVX512VL cases we have to canonicalize VTRUNC cases to use a 512-bit source vector (inserting undefs/zeros in the upper elements as necessary), then truncate and (possibly) extract the 128-bit result.

This should address the last regressions in D66004.

Differential Revision: https://reviews.llvm.org/D86093
parent 3efdf8c410
commit 17226c7625
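As a rough illustration of the shuffles this targets (a sketch written for this summary, not part of the diff: the function name and body below are modeled on the shuffle_v32i8_to_v16i8 test whose check lines change further down), taking every even byte of a 256-bit load reaches the DAG as a binary v16i8 shuffle of the load's two 128-bit halves. With this patch those halves are concatenated back together and truncated, so AVX512BW now emits a single vpmovwb from a widened 512-bit source instead of a vpand+vpackuswb sequence:

define void @even_bytes_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
  ; Load 32 bytes, keep bytes 0,2,4,...,30 and store the 16-byte result.
  %vec = load <32 x i8>, <32 x i8>* %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i8> %strided.vec, <16 x i8>* %S
  ret void
}

Feeding this through llc with -mattr=+avx512bw (and without +avx512vl) should exercise the non-VLX path described above, which widens the concatenated source to 512 bits before truncating.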
@ -11325,17 +11325,15 @@ static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
//
// But when avx512vl is available, one can just use a single vpmovdw
// instruction.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
                                     MVT VT, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
// TODO: Merge with lowerShuffleAsVTRUNC.
static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
                                     SDValue V2, ArrayRef<int> Mask,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");

  if (Mask.size() != VT.getVectorNumElements())
    return SDValue();

  bool SwappedOps = false;

  // TODO: Convert to use Zeroable bitmask.
  if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
    if (!ISD::isBuildVectorAllZeros(V1.getNode()))
      return SDValue();
@ -11378,6 +11376,73 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}

// Attempt to match binary shuffle patterns as a truncate.
static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1,
                                    SDValue V2, ArrayRef<int> Mask,
                                    const APInt &Zeroable,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type");
  if (!Subtarget.hasAVX512())
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  unsigned MaxScale = 64 / VT.getScalarSizeInBits();
  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    // TODO: Support non-BWI VPMOVWB truncations?
    unsigned SrcEltBits = EltSizeInBits * Scale;
    if (SrcEltBits < 32 && !Subtarget.hasBWI())
      continue;

    // Match shuffle <0,Scale,2*Scale,..,undef_or_zero,undef_or_zero,...>
    // Bail if the V2 elements are undef.
    unsigned NumHalfSrcElts = NumElts / Scale;
    unsigned NumSrcElts = 2 * NumHalfSrcElts;
    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
        isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts))
      continue;

    // The elements beyond the truncation must be undef/zero.
    unsigned UpperElts = NumElts - NumSrcElts;
    if (UpperElts > 0 &&
        !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
      continue;

    // As we're using both sources then we need to concat them together
    // and truncate from the 256-bit src.
    MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2);
    SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2);

    MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
    MVT SrcVT = MVT::getVectorVT(SrcSVT, 256 / SrcEltBits);
    Src = DAG.getBitcast(SrcVT, Src);

    if (SrcVT.getVectorNumElements() == NumElts)
      return DAG.getNode(ISD::TRUNCATE, DL, VT, Src);

    if (!Subtarget.hasVLX()) {
      // Non-VLX targets must truncate from a 512-bit type, so we need to
      // widen, truncate and then possibly extract the original 128-bit
      // vector.
      bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
      Src = widenSubVector(Src, !UndefUppers, Subtarget, DAG, DL, 512);
      unsigned NumWideSrcElts = Src.getValueType().getVectorNumElements();
      if (NumWideSrcElts >= NumElts) {
        // Widening means we can now use a regular TRUNCATE.
        MVT WideVT = MVT::getVectorVT(VT.getScalarType(), NumWideSrcElts);
        SDValue WideRes = DAG.getNode(ISD::TRUNCATE, DL, WideVT, Src);
        if (!WideVT.is128BitVector())
          WideRes = extract128BitVector(WideRes, 0, DAG, DL);
        return WideRes;
      }
    }
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
  }

  return SDValue();
}

/// Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
@ -14733,7 +14798,7 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

  // Try to use lower using a truncation.
  if (SDValue V =
          lowerShuffleWithVPMOV(DL, Mask, MVT::v8i16, V1, V2, DAG, Subtarget))
          lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return V;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
@ -14816,6 +14881,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                Subtarget))
    return V;

  // Try to use lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
                                                Subtarget, DAG))
@ -14922,7 +14992,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,

  // Try to use lower using a truncation.
  if (SDValue V =
          lowerShuffleWithVPMOV(DL, Mask, MVT::v16i8, V1, V2, DAG, Subtarget))
          lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return V;

  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // See if we can use SSE4A Extraction / Insertion.
@ -42,11 +42,10 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
; AVX512BW-NEXT: vpand 16(%rdi), %xmm0, %xmm1
; AVX512BW-NEXT: vpand (%rdi), %xmm0, %xmm0
; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
@ -143,11 +142,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
|
||||
;
|
||||
; AVX512F-LABEL: shuffle_v16i16_to_v8i16:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
|
||||
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
|
||||
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
|
||||
@ -159,11 +157,10 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
|
||||
;
|
||||
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1],mem[2],xmm0[3],mem[4],xmm0[5],mem[6],xmm0[7]
|
||||
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
|
||||
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
|
||||
@ -377,54 +374,42 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
|
||||
;
|
||||
; AVX512F-LABEL: shuffle_v32i8_to_v8i8:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
|
||||
; AVX512VL: # %bb.0:
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VL-NEXT: vpmovdb %xmm1, %xmm1
|
||||
; AVX512VL-NEXT: vpmovdb %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512VL-NEXT: vpmovdb %ymm0, %xmm0
|
||||
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512VL-NEXT: vzeroupper
|
||||
; AVX512VL-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
|
||||
; AVX512BWVL: # %bb.0:
|
||||
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BWVL-NEXT: vpmovdb %xmm1, %xmm1
|
||||
; AVX512BWVL-NEXT: vpmovdb %xmm0, %xmm0
|
||||
; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BWVL-NEXT: vpmovdb %ymm0, %xmm0
|
||||
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512BWVL-NEXT: vzeroupper
|
||||
; AVX512BWVL-NEXT: retq
|
||||
;
|
||||
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v8i8:
|
||||
; AVX512VBMIVL: # %bb.0:
|
||||
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VBMIVL-NEXT: vpmovdb %xmm1, %xmm1
|
||||
; AVX512VBMIVL-NEXT: vpmovdb %xmm0, %xmm0
|
||||
; AVX512VBMIVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512VBMIVL-NEXT: vpmovdb %ymm0, %xmm0
|
||||
; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512VBMIVL-NEXT: vzeroupper
|
||||
; AVX512VBMIVL-NEXT: retq
|
||||
%vec = load <32 x i8>, <32 x i8>* %L
|
||||
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
|
||||
@ -1081,49 +1066,42 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
|
||||
;
|
||||
; AVX512F-LABEL: shuffle_v16i16_to_v4i16:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512F-NEXT: vpmovqw %zmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
|
||||
; AVX512VL: # %bb.0:
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
|
||||
; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512VL-NEXT: vpmovqw %ymm0, %xmm0
|
||||
; AVX512VL-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512VL-NEXT: vzeroupper
|
||||
; AVX512VL-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
|
||||
; AVX512BWVL: # %bb.0:
|
||||
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
|
||||
; AVX512BWVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
|
||||
; AVX512BWVL-NEXT: vmovq %xmm1, (%rsi)
|
||||
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BWVL-NEXT: vpmovqw %ymm0, %xmm0
|
||||
; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512BWVL-NEXT: vzeroupper
|
||||
; AVX512BWVL-NEXT: retq
|
||||
;
|
||||
; AVX512VBMIVL-LABEL: shuffle_v16i16_to_v4i16:
|
||||
; AVX512VBMIVL: # %bb.0:
|
||||
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VBMIVL-NEXT: vmovdqa {{.*#+}} xmm1 = <0,4,8,12,u,u,u,u>
|
||||
; AVX512VBMIVL-NEXT: vpermi2w 16(%rdi), %xmm0, %xmm1
|
||||
; AVX512VBMIVL-NEXT: vmovq %xmm1, (%rsi)
|
||||
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512VBMIVL-NEXT: vpmovqw %ymm0, %xmm0
|
||||
; AVX512VBMIVL-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512VBMIVL-NEXT: vzeroupper
|
||||
; AVX512VBMIVL-NEXT: retq
|
||||
%vec = load <16 x i16>, <16 x i16>* %L
|
||||
%strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
|
||||
@ -1199,54 +1177,42 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
|
||||
;
|
||||
; AVX512F-LABEL: shuffle_v32i8_to_v4i8:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512F-NEXT: vpmovqb %zmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovd %xmm0, (%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
|
||||
; AVX512VL: # %bb.0:
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VL-NEXT: vpmovqb %xmm1, %xmm1
|
||||
; AVX512VL-NEXT: vpmovqb %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0
|
||||
; AVX512VL-NEXT: vmovd %xmm0, (%rsi)
|
||||
; AVX512VL-NEXT: vzeroupper
|
||||
; AVX512VL-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovd %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
|
||||
; AVX512BWVL: # %bb.0:
|
||||
; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BWVL-NEXT: vpmovqb %xmm1, %xmm1
|
||||
; AVX512BWVL-NEXT: vpmovqb %xmm0, %xmm0
|
||||
; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0
|
||||
; AVX512BWVL-NEXT: vmovd %xmm0, (%rsi)
|
||||
; AVX512BWVL-NEXT: vzeroupper
|
||||
; AVX512BWVL-NEXT: retq
|
||||
;
|
||||
; AVX512VBMIVL-LABEL: shuffle_v32i8_to_v4i8:
|
||||
; AVX512VBMIVL: # %bb.0:
|
||||
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VBMIVL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VBMIVL-NEXT: vpmovqb %xmm1, %xmm1
|
||||
; AVX512VBMIVL-NEXT: vpmovqb %xmm0, %xmm0
|
||||
; AVX512VBMIVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512VBMIVL-NEXT: vmovdqa (%rdi), %ymm0
|
||||
; AVX512VBMIVL-NEXT: vpmovqb %ymm0, %xmm0
|
||||
; AVX512VBMIVL-NEXT: vmovd %xmm0, (%rsi)
|
||||
; AVX512VBMIVL-NEXT: vzeroupper
|
||||
; AVX512VBMIVL-NEXT: retq
|
||||
%vec = load <32 x i8>, <32 x i8>* %L
|
||||
%strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
|
||||
|
@ -178,20 +178,17 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
|
||||
define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
|
||||
; AVX512F-LABEL: shuffle_v64i8_to_v16i8:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
|
||||
; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512F-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
|
||||
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
|
||||
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
|
||||
@ -211,20 +208,17 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
|
||||
;
|
||||
; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
|
||||
; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpmovdb %zmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
|
||||
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
|
||||
@ -244,20 +238,17 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
|
||||
;
|
||||
; AVX512VBMI-LABEL: shuffle_v64i8_to_v16i8:
|
||||
; AVX512VBMI: # %bb.0:
|
||||
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512VBMI-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
||||
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512VBMI-NEXT: vpmovdb %zmm1, %xmm1
|
||||
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
|
||||
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512VBMI-NEXT: vzeroupper
|
||||
; AVX512VBMI-NEXT: retq
|
||||
;
|
||||
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v16i8:
|
||||
@ -293,44 +284,43 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512F-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512F-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
|
||||
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
|
||||
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
|
||||
; AVX512VL: # %bb.0:
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
|
||||
; AVX512VL-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,0,1,8,9,8,9,10,11,12,13,14,15]
|
||||
; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
|
||||
; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
|
||||
; AVX512VL-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
||||
; AVX512VL-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512VL-NEXT: vpmovqw %ymm1, %xmm1
|
||||
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
|
||||
; AVX512VL-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512VL-NEXT: vzeroupper
|
||||
; AVX512VL-NEXT: retq
|
||||
;
|
||||
; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512BW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpmovqw %zmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
|
||||
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
|
||||
@ -346,13 +336,14 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
|
||||
; AVX512VBMI: # %bb.0:
|
||||
; AVX512VBMI-NEXT: vpxor %xmm0, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
|
||||
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
|
||||
; AVX512VBMI-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512VBMI-NEXT: vpmovqw %zmm1, %xmm1
|
||||
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
|
||||
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
|
||||
; AVX512VBMI-NEXT: vzeroupper
|
||||
; AVX512VBMI-NEXT: retq
|
||||
;
|
||||
; AVX512VBMIVL-LABEL: shuffle_v32i16_to_v8i16:
|
||||
@ -386,20 +377,17 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
|
||||
define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
|
||||
; AVX512F-LABEL: shuffle_v64i8_to_v8i8:
|
||||
; AVX512F: # %bb.0:
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512F-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
|
||||
; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
|
||||
; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512F-NEXT: vpmovqb %zmm1, %xmm1
|
||||
; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
|
||||
; AVX512F-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
|
||||
@ -415,20 +403,17 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
|
||||
;
|
||||
; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
|
||||
; AVX512BW: # %bb.0:
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
|
||||
; AVX512BW-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512BW-NEXT: vpmovqb %zmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
|
||||
; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
|
||||
@ -444,20 +429,17 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
|
||||
;
|
||||
; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8:
|
||||
; AVX512VBMI: # %bb.0:
|
||||
; AVX512VBMI-NEXT: vmovdqa (%rdi), %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa 16(%rdi), %xmm1
|
||||
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vmovdqa 32(%rdi), %xmm2
|
||||
; AVX512VBMI-NEXT: vmovdqa 48(%rdi), %xmm3
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,0,8,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
|
||||
; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm1, %xmm1
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
|
||||
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
|
||||
; AVX512VBMI-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512VBMI-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
|
||||
; AVX512VBMI-NEXT: vmovdqa (%rdi), %ymm1
|
||||
; AVX512VBMI-NEXT: vpmovqb %zmm1, %xmm1
|
||||
; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
|
||||
; AVX512VBMI-NEXT: vmovq %xmm0, (%rsi)
|
||||
; AVX512VBMI-NEXT: vzeroupper
|
||||
; AVX512VBMI-NEXT: retq
|
||||
;
|
||||
; AVX512VBMIVL-LABEL: shuffle_v64i8_to_v8i8:
|
||||
|
@ -1581,10 +1581,11 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
|
||||
;
|
||||
; AVX512F-LABEL: trunc2x4i32_8i16:
|
||||
; AVX512F: # %bb.0: # %entry
|
||||
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
|
||||
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
|
||||
; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
|
||||
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
|
||||
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX512F-NEXT: vzeroupper
|
||||
; AVX512F-NEXT: retq
|
||||
;
|
||||
; AVX512VL-LABEL: trunc2x4i32_8i16:
|
||||
@ -1597,10 +1598,11 @@ define <8 x i16> @trunc2x4i32_8i16(<4 x i32> %a, <4 x i32> %b) {
|
||||
;
|
||||
; AVX512BW-LABEL: trunc2x4i32_8i16:
|
||||
; AVX512BW: # %bb.0: # %entry
|
||||
; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
|
||||
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7]
|
||||
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
|
||||
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: trunc2x4i32_8i16:
|
||||
@ -1709,10 +1711,11 @@ define <16 x i8> @trunc2x8i16_16i8(<8 x i16> %a, <8 x i16> %b) {
|
||||
;
|
||||
; AVX512BW-LABEL: trunc2x8i16_16i8:
|
||||
; AVX512BW: # %bb.0: # %entry
|
||||
; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
|
||||
; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1
|
||||
; AVX512BW-NEXT: vpand %xmm2, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
|
||||
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
|
||||
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
||||
; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
|
||||
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
||||
; AVX512BW-NEXT: vzeroupper
|
||||
; AVX512BW-NEXT: retq
|
||||
;
|
||||
; AVX512BWVL-LABEL: trunc2x8i16_16i8:
|
||||
|
@ -383,33 +383,88 @@ ret void
|
||||
}
|
||||
|
||||
define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
|
||||
; AVX-LABEL: interleaved_load_vf8_i8_stride4:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX-NEXT: vmovdqa (%rdi), %xmm1
|
||||
; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
|
||||
; AVX-NEXT: vpshufb %xmm0, %xmm2, %xmm3
|
||||
; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
|
||||
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm4
|
||||
; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm3
|
||||
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
||||
; AVX-NEXT: vpaddb %xmm3, %xmm1, %xmm1
|
||||
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
||||
; AVX-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
||||
; AVX-NEXT: vpmullw %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; AVX-NEXT: retq
|
||||
; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX1-NEXT: vmovdqa (%rdi), %xmm1
|
||||
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
|
||||
; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
|
||||
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
|
||||
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
||||
; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
||||
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
||||
; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: interleaved_load_vf8_i8_stride4:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vmovdqa (%rdi), %xmm1
|
||||
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
|
||||
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
|
||||
; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
||||
; AVX2-NEXT: vpaddb %xmm3, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
||||
; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
||||
; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; AVX2-NEXT: retq
|
||||
;
|
||||
; AVX512-LABEL: interleaved_load_vf8_i8_stride4:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vmovdqu (%rdi), %ymm0
|
||||
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm2
|
||||
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
|
||||
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
|
||||
; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1
|
||||
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
|
||||
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
|
||||
; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
|
||||
; AVX512-NEXT: vzeroupper
|
||||
; AVX512-NEXT: retq
|
||||
%wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
|
||||
%v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
|
||||
%v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
|
||||
@ -529,10 +584,8 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(<64 x i8>* %ptr) {
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm1, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %ymm5
|
||||
; AVX512-NEXT: vpmovdb %zmm5, %xmm5
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2,3]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm3, %xmm6
|
||||
@ -762,85 +815,83 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(<128 x i8>* %ptr) {
|
||||
;
|
||||
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
|
||||
; AVX512: # %bb.0:
|
||||
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm11
|
||||
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm14
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm0, %xmm11, %xmm3
|
||||
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm13
|
||||
; AVX512-NEXT: vpshufb %xmm0, %xmm13, %xmm0
|
||||
; AVX512-NEXT: vpshufb %xmm0, %xmm14, %xmm3
|
||||
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
|
||||
; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm0
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm14
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm6
|
||||
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
|
||||
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
|
||||
; AVX512-NEXT: vpmovdb %zmm5, %xmm5
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm5[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm0
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
|
||||
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
|
||||
; AVX512-NEXT: vpmovdb %zmm3, %xmm3
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
|
||||
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
|
||||
; AVX512-NEXT: vpmovdb %zmm3, %xmm3
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm3[0,1,2,3],ymm0[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
|
||||
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm0
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm14, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm4, %xmm7
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1,2,3,4,5],ymm0[6,7]
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm10
|
||||
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
|
||||
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
|
||||
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm0, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm7, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm12, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm10, %xmm1
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm8[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm13, %xmm6
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm6 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm14, %xmm2
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm4, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm1, %xmm7, %xmm1
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm12, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm6, %xmm10, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm13, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm14, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm4, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm0[6,7]
|
||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm12
|
||||
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm13
|
||||
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm7
|
||||
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm0
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3],ymm8[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,2,6,10,14,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpshufb %xmm2, %xmm7, %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm2
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm5 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm11, %xmm6
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm10, %xmm1
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm13, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm5, %xmm12, %xmm5
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,3,7,11,15,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm14, %xmm4
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
|
||||
; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm11, %xmm5
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm10, %xmm6
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
|
||||
; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
|
||||
; AVX512-NEXT: vpshufb %xmm3, %xmm7, %xmm3
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm13, %xmm3
|
||||
; AVX512-NEXT: vpshufb %xmm4, %xmm12, %xmm4
|
||||
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
|
||||
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
|
||||
; AVX512-NEXT: vpcmpeqb %zmm8, %zmm9, %k0
|
||||
; AVX512-NEXT: vpcmpeqb %zmm0, %zmm1, %k1
|
||||
; AVX512-NEXT: kxnord %k1, %k0, %k0