1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 11:02:59 +02:00

[X86] Lower _mm[256|512]_[mask[z]]_avg_epu[8|16] intrinsics to native llvm IR

Differential Revision: https://reviews.llvm.org/D37560

llvm-svn: 313013
This commit is contained in:
Yael Tsafrir 2017-09-12 07:50:35 +00:00
parent a68e72a0c8
commit 78fe0a651c
21 changed files with 1170 additions and 336 deletions

View File

@ -379,12 +379,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">,
Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem, Commutative]>;
def int_x86_sse2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb128">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
llvm_v16i8_ty], [IntrNoMem, Commutative]>;
def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
llvm_v8i16_ty], [IntrNoMem, Commutative]>;
def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">,
Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty,
llvm_v16i8_ty], [IntrNoMem, Commutative]>;
@ -1678,12 +1672,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb256">,
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
llvm_v32i8_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
llvm_v32i8_ty], [IntrNoMem, Commutative]>;
@ -4947,24 +4935,6 @@ let TargetPrefix = "x86" in {
def int_x86_avx512_mask_pmulh_w_256 : GCCBuiltin<"__builtin_ia32_pmulhw256_mask">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512_mask">,
Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty,
llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pavg_w_512 : GCCBuiltin<"__builtin_ia32_pavgw512_mask">,
Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty,
llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pavg_b_128 : GCCBuiltin<"__builtin_ia32_pavgb128_mask">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pavg_b_256 : GCCBuiltin<"__builtin_ia32_pavgb256_mask">,
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pavg_w_128 : GCCBuiltin<"__builtin_ia32_pavgw128_mask">,
Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pavg_w_256 : GCCBuiltin<"__builtin_ia32_pavgw256_mask">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
def int_x86_avx512_mask_pmaddw_d_128 :
GCCBuiltin<"__builtin_ia32_pmaddwd128_mask">,
Intrinsic<[llvm_v4i32_ty],

View File

@ -252,7 +252,10 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx512.mask.move.s") || // Added in 4.0
Name.startswith("avx512.cvtmask2") || // Added in 5.0
(Name.startswith("xop.vpcom") && // Added in 3.2
F->arg_size() == 2))
F->arg_size() == 2) ||
Name.startswith("sse2.pavg") || // Added in 6.0
Name.startswith("avx2.pavg") || // Added in 6.0
Name.startswith("avx512.mask.pavg")) // Added in 6.0
return true;
return false;
@ -1972,6 +1975,25 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
LoadInst *LI = Builder.CreateAlignedLoad(BC, VTy->getBitWidth() / 8);
LI->setMetadata(M->getMDKindID("nontemporal"), Node);
Rep = LI;
} else if (IsX86 &&
(Name.startswith("sse2.pavg") || Name.startswith("avx2.pavg") ||
Name.startswith("avx512.mask.pavg"))) {
// llvm.x86.sse2.pavg.b/w, llvm.x86.avx2.pavg.b/w,
// llvm.x86.avx512.mask.pavg.b/w
Value *A = CI->getArgOperand(0);
Value *B = CI->getArgOperand(1);
VectorType *ZextType = VectorType::getExtendedElementVectorType(
cast<VectorType>(A->getType()));
Value *ExtendedA = Builder.CreateZExt(A, ZextType);
Value *ExtendedB = Builder.CreateZExt(B, ZextType);
Value *Sum = Builder.CreateAdd(ExtendedA, ExtendedB);
Value *AddOne = Builder.CreateAdd(Sum, ConstantInt::get(ZextType, 1));
Value *ShiftR = Builder.CreateLShr(AddOne, ConstantInt::get(ZextType, 1));
Rep = Builder.CreateTrunc(ShiftR, A->getType());
if (CI->getNumArgOperands() > 2) {
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
CI->getArgOperand(2));
}
} else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
Value *Arg = CI->getArgOperand(0);
Value *Neg = Builder.CreateNeg(Arg, "neg");

View File

@ -383,8 +383,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@ -818,12 +816,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
@ -1593,8 +1585,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),

View File

@ -0,0 +1,449 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BWVL
define <16 x i8> @avg_v16i8_mask(<16 x i8> %a, <16 x i8> %b, <16 x i8> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpblendvb %xmm1, %xmm0, %xmm2, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i8_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT: retq
%za = zext <16 x i8> %a to <16 x i16>
%zb = zext <16 x i8> %b to <16 x i16>
%add = add nuw nsw <16 x i16> %za, %zb
%add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%trunc = trunc <16 x i16> %lshr to <16 x i8>
%mask1 = bitcast i16 %mask to <16 x i1>
%res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> %src
ret <16 x i8> %res
}
define <16 x i8> @avg_v16i8_maskz(<16 x i8> %a, <16 x i8> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i8_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i8_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
%za = zext <16 x i8> %a to <16 x i16>
%zb = zext <16 x i8> %b to <16 x i16>
%add = add nuw nsw <16 x i16> %za, %zb
%add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%trunc = trunc <16 x i16> %lshr to <16 x i8>
%mask1 = bitcast i16 %mask to <16 x i1>
%res = select <16 x i1> %mask1, <16 x i8> %trunc, <16 x i8> zeroinitializer
ret <16 x i8> %res
}
define <32 x i8> @avg_v32i8_mask(<32 x i8> %a, <32 x i8> %b, <32 x i8> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
; AVX512F-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT: retq
%za = zext <32 x i8> %a to <32 x i16>
%zb = zext <32 x i8> %b to <32 x i16>
%add = add nuw nsw <32 x i16> %za, %zb
%add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%trunc = trunc <32 x i16> %lshr to <32 x i8>
%mask1 = bitcast i32 %mask to <32 x i1>
%res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> %src
ret <32 x i8> %res
}
define <32 x i8> @avg_v32i8_maskz(<32 x i8> %a, <32 x i8> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i8_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm1, %xmm1
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i8_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
%za = zext <32 x i8> %a to <32 x i16>
%zb = zext <32 x i8> %b to <32 x i16>
%add = add nuw nsw <32 x i16> %za, %zb
%add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%trunc = trunc <32 x i16> %lshr to <32 x i8>
%mask1 = bitcast i32 %mask to <32 x i1>
%res = select <32 x i1> %mask1, <32 x i8> %trunc, <32 x i8> zeroinitializer
ret <32 x i8> %res
}
define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm6
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm7
; AVX512F-NEXT: vpavgb %xmm7, %xmm6, %xmm6
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm7
; AVX512F-NEXT: vpavgb %xmm7, %xmm8, %xmm7
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm7, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovq %rdi, %k1
; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT: retq
%za = zext <64 x i8> %a to <64 x i16>
%zb = zext <64 x i8> %b to <64 x i16>
%add = add nuw nsw <64 x i16> %za, %zb
%add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%trunc = trunc <64 x i16> %lshr to <64 x i8>
%mask1 = bitcast i64 %mask to <64 x i1>
%res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> %src
ret <64 x i8> %res
}
define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwind {
; AVX512F-LABEL: avg_v64i8_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $64, %rsp
; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: shrq $32, %rax
; AVX512F-NEXT: movl %eax, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm2, %xmm2
; AVX512F-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm3, %xmm3
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v64i8_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovq %rdi, %k1
; AVX512BWVL-NEXT: vpavgb %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
%za = zext <64 x i8> %a to <64 x i16>
%zb = zext <64 x i8> %b to <64 x i16>
%add = add nuw nsw <64 x i16> %za, %zb
%add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%trunc = trunc <64 x i16> %lshr to <64 x i8>
%mask1 = bitcast i64 %mask to <64 x i1>
%res = select <64 x i1> %mask1, <64 x i8> %trunc, <64 x i8> zeroinitializer
ret <64 x i8> %res
}
define <8 x i16> @avg_v8i16_mask(<8 x i16> %a, <8 x i16> %b, <8 x i16> %src, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v8i16_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %xmm2, %xmm0
; AVX512BWVL-NEXT: retq
%za = zext <8 x i16> %a to <8 x i32>
%zb = zext <8 x i16> %b to <8 x i32>
%add = add nuw nsw <8 x i32> %za, %zb
%add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%trunc = trunc <8 x i32> %lshr to <8 x i16>
%mask1 = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> %src
ret <8 x i16> %res
}
define <8 x i16> @avg_v8i16_maskz(<8 x i16> %a, <8 x i16> %b, i8 %mask) nounwind {
; AVX512F-LABEL: avg_v8i16_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovqw %zmm1, %xmm1
; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v8i16_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %xmm1, %xmm0, %xmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
%za = zext <8 x i16> %a to <8 x i32>
%zb = zext <8 x i16> %b to <8 x i32>
%add = add nuw nsw <8 x i32> %za, %zb
%add1 = add nuw nsw <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%trunc = trunc <8 x i32> %lshr to <8 x i16>
%mask1 = bitcast i8 %mask to <8 x i1>
%res = select <8 x i1> %mask1, <8 x i16> %trunc, <8 x i16> zeroinitializer
ret <8 x i16> %res
}
define <16 x i16> @avg_v16i16_mask(<16 x i16> %a, <16 x i16> %b, <16 x i16> %src, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: vpandn %ymm2, %ymm1, %ymm1
; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i16_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa %ymm2, %ymm0
; AVX512BWVL-NEXT: retq
%za = zext <16 x i16> %a to <16 x i32>
%zb = zext <16 x i16> %b to <16 x i32>
%add = add nuw nsw <16 x i32> %za, %zb
%add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%trunc = trunc <16 x i32> %lshr to <16 x i16>
%mask1 = bitcast i16 %mask to <16 x i1>
%res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> %src
ret <16 x i16> %res
}
define <16 x i16> @avg_v16i16_maskz(<16 x i16> %a, <16 x i16> %b, i16 %mask) nounwind {
; AVX512F-LABEL: avg_v16i16_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: kmovw %edi, %k1
; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpmovdw %zmm1, %ymm1
; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v16i16_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
%za = zext <16 x i16> %a to <16 x i32>
%zb = zext <16 x i16> %b to <16 x i32>
%add = add nuw nsw <16 x i32> %za, %zb
%add1 = add nuw nsw <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%trunc = trunc <16 x i32> %lshr to <16 x i16>
%mask1 = bitcast i16 %mask to <16 x i1>
%res = select <16 x i1> %mask1, <16 x i16> %trunc, <16 x i16> zeroinitializer
ret <16 x i16> %res
}
define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_mask:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm6, %xmm6
; AVX512F-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm7, %xmm7
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero,xmm7[8],zero,xmm7[9],zero,xmm7[10],zero,xmm7[11],zero,xmm7[12],zero,xmm7[13],zero,xmm7[14],zero,xmm7[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_mask:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BWVL-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BWVL-NEXT: retq
%za = zext <32 x i16> %a to <32 x i32>
%zb = zext <32 x i16> %b to <32 x i32>
%add = add nuw nsw <32 x i32> %za, %zb
%add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%trunc = trunc <32 x i32> %lshr to <32 x i16>
%mask1 = bitcast i32 %mask to <32 x i1>
%res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> %src
ret <32 x i16> %res
}
define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nounwind {
; AVX512F-LABEL: avg_v32i16_maskz:
; AVX512F: # BB#0:
; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: movq %rsp, %rbp
; AVX512F-NEXT: andq $-32, %rsp
; AVX512F-NEXT: subq $32, %rsp
; AVX512F-NEXT: movl %edi, (%rsp)
; AVX512F-NEXT: kmovw (%rsp), %k1
; AVX512F-NEXT: kmovw {{[0-9]+}}(%rsp), %k2
; AVX512F-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
; AVX512F-NEXT: vpmovdb %zmm4, %xmm4
; AVX512F-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z}
; AVX512F-NEXT: vpmovdb %zmm5, %xmm5
; AVX512F-NEXT: vpavgw %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vpavgw %ymm2, %ymm0, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX512F-NEXT: vpsllw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpsraw $15, %ymm2, %ymm2
; AVX512F-NEXT: vpand %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: movq %rbp, %rsp
; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512BWVL-LABEL: avg_v32i16_maskz:
; AVX512BWVL: # BB#0:
; AVX512BWVL-NEXT: kmovd %edi, %k1
; AVX512BWVL-NEXT: vpavgw %zmm1, %zmm0, %zmm0 {%k1} {z}
; AVX512BWVL-NEXT: retq
%za = zext <32 x i16> %a to <32 x i32>
%zb = zext <32 x i16> %b to <32 x i32>
%add = add nuw nsw <32 x i32> %za, %zb
%add1 = add nuw nsw <32 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <32 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%trunc = trunc <32 x i32> %lshr to <32 x i16>
%mask1 = bitcast i32 %mask to <32 x i1>
%res = select <32 x i1> %mask1, <32 x i16> %trunc, <32 x i16> zeroinitializer
ret <32 x i16> %res
}

View File

@ -5,7 +5,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW
define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v4i8:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@ -33,7 +33,7 @@ define void @avg_v4i8(<4 x i8>* %a, <4 x i8>* %b) {
ret void
}
define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v8i8:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@ -61,7 +61,7 @@ define void @avg_v8i8(<8 x i8>* %a, <8 x i8>* %b) {
ret void
}
define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v16i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
@ -87,7 +87,7 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) {
ret void
}
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
@ -265,7 +265,7 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
ret void
}
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm6
@ -450,8 +450,6 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
; AVX1-LABEL: avg_v64i8:
; AVX1: # BB#0:
; AVX1-NEXT: subq $24, %rsp
; AVX1-NEXT: .Lcfi0:
; AVX1-NEXT: .cfi_def_cfa_offset 32
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@ -727,7 +725,7 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
ret void
}
define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v4i16:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@ -755,7 +753,7 @@ define void @avg_v4i16(<4 x i16>* %a, <4 x i16>* %b) {
ret void
}
define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v8i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rsi), %xmm0
@ -781,7 +779,7 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) {
ret void
}
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm2
@ -890,7 +888,7 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
ret void
}
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm4
@ -1116,7 +1114,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
ret void
}
define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v4i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@ -1144,7 +1142,7 @@ define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) {
ret void
}
define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v8i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@ -1172,7 +1170,7 @@ define void @avg_v8i8_2(<8 x i8>* %a, <8 x i8>* %b) {
ret void
}
define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v16i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
@ -1198,7 +1196,7 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) {
ret void
}
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v32i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
@ -1376,7 +1374,7 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
ret void
}
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) nounwind {
; SSE2-LABEL: avg_v64i8_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rsi), %xmm14
@ -1750,7 +1748,7 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
}
define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v4i16_2:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@ -1778,7 +1776,7 @@ define void @avg_v4i16_2(<4 x i16>* %a, <4 x i16>* %b) {
ret void
}
define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v8i16_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
@ -1804,7 +1802,7 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) {
ret void
}
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v16i16_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm2
@ -1913,7 +1911,7 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
ret void
}
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) nounwind {
; SSE2-LABEL: avg_v32i16_2:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm4
@ -2139,7 +2137,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
ret void
}
define void @avg_v4i8_const(<4 x i8>* %a) {
define void @avg_v4i8_const(<4 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v4i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
@ -2162,7 +2160,7 @@ define void @avg_v4i8_const(<4 x i8>* %a) {
ret void
}
define void @avg_v8i8_const(<8 x i8>* %a) {
define void @avg_v8i8_const(<8 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v8i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@ -2185,7 +2183,7 @@ define void @avg_v8i8_const(<8 x i8>* %a) {
ret void
}
define void @avg_v16i8_const(<16 x i8>* %a) {
define void @avg_v16i8_const(<16 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v16i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
@ -2208,7 +2206,7 @@ define void @avg_v16i8_const(<16 x i8>* %a) {
ret void
}
define void @avg_v32i8_const(<32 x i8>* %a) {
define void @avg_v32i8_const(<32 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v32i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm5
@ -2341,7 +2339,7 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
ret void
}
define void @avg_v64i8_const(<64 x i8>* %a) {
define void @avg_v64i8_const(<64 x i8>* %a) nounwind {
; SSE2-LABEL: avg_v64i8_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm5
@ -2661,7 +2659,7 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
ret void
}
define void @avg_v4i16_const(<4 x i16>* %a) {
define void @avg_v4i16_const(<4 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v4i16_const:
; SSE2: # BB#0:
; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
@ -2684,7 +2682,7 @@ define void @avg_v4i16_const(<4 x i16>* %a) {
ret void
}
define void @avg_v8i16_const(<8 x i16>* %a) {
define void @avg_v8i16_const(<8 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v8i16_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm0
@ -2707,7 +2705,7 @@ define void @avg_v8i16_const(<8 x i16>* %a) {
ret void
}
define void @avg_v16i16_const(<16 x i16>* %a) {
define void @avg_v16i16_const(<16 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v16i16_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm3
@ -2795,7 +2793,7 @@ define void @avg_v16i16_const(<16 x i16>* %a) {
ret void
}
define void @avg_v32i16_const(<32 x i16>* %a) {
define void @avg_v32i16_const(<32 x i16>* %a) nounwind {
; SSE2-LABEL: avg_v32i16_const:
; SSE2: # BB#0:
; SSE2-NEXT: movdqa (%rdi), %xmm7
@ -2968,3 +2966,332 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
store <32 x i16> %5, <32 x i16>* undef, align 4
ret void
}
define <16 x i8> @avg_v16i8_3(<16 x i8> %a, <16 x i8> %b) nounwind {
; SSE2-LABEL: avg_v16i8_3:
; SSE2: # BB#0:
; SSE2-NEXT: pavgb %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: avg_v16i8_3:
; AVX: # BB#0:
; AVX-NEXT: vpavgb %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%za = zext <16 x i8> %a to <16 x i16>
%zb = zext <16 x i8> %b to <16 x i16>
%add = add nuw nsw <16 x i16> %za, %zb
%add1 = add nuw nsw <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%res = trunc <16 x i16> %lshr to <16 x i8>
ret <16 x i8> %res
}
define <32 x i8> @avg_v32i8_3(<32 x i8> %a, <32 x i8> %b) nounwind {
; SSE2-LABEL: avg_v32i8_3:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm5, %xmm5
; SSE2-NEXT: movdqa %xmm0, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
; SSE2-NEXT: movdqa %xmm1, %xmm7
; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE2-NEXT: movdqa %xmm2, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; SSE2-NEXT: paddw %xmm6, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
; SSE2-NEXT: paddw %xmm2, %xmm0
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15]
; SSE2-NEXT: paddw %xmm7, %xmm2
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; SSE2-NEXT: paddw %xmm3, %xmm1
; SSE2-NEXT: pcmpeqd %xmm3, %xmm3
; SSE2-NEXT: psubw %xmm3, %xmm4
; SSE2-NEXT: psubw %xmm3, %xmm0
; SSE2-NEXT: psubw %xmm3, %xmm2
; SSE2-NEXT: psubw %xmm3, %xmm1
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm3, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: packuswb %xmm4, %xmm0
; SSE2-NEXT: pand %xmm3, %xmm2
; SSE2-NEXT: pand %xmm3, %xmm1
; SSE2-NEXT: packuswb %xmm2, %xmm1
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v32i8_3:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT: vpaddw %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
; AVX1-NEXT: vpaddw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpaddw %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpsubw %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm1, %xmm4, %xmm4
; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm3
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v32i8_3:
; AVX2: # BB#0:
; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: avg_v32i8_3:
; AVX512: # BB#0:
; AVX512-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; AVX512-NEXT: retq
%za = zext <32 x i8> %a to <32 x i16>
%zb = zext <32 x i8> %b to <32 x i16>
%add = add nuw nsw <32 x i16> %za, %zb
%add1 = add nuw nsw <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%res = trunc <32 x i16> %lshr to <32 x i8>
ret <32 x i8> %res
}
define <64 x i8> @avg_v64i8_3(<64 x i8> %a, <64 x i8> %b) nounwind {
; SSE2-LABEL: avg_v64i8_3:
; SSE2: # BB#0:
; SSE2-NEXT: pxor %xmm9, %xmm9
; SSE2-NEXT: movdqa %xmm0, %xmm10
; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7]
; SSE2-NEXT: movdqa %xmm1, %xmm11
; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7]
; SSE2-NEXT: movdqa %xmm2, %xmm12
; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
; SSE2-NEXT: movdqa %xmm3, %xmm13
; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15]
; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
; SSE2-NEXT: movdqa %xmm4, %xmm8
; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm9[8],xmm8[9],xmm9[9],xmm8[10],xmm9[10],xmm8[11],xmm9[11],xmm8[12],xmm9[12],xmm8[13],xmm9[13],xmm8[14],xmm9[14],xmm8[15],xmm9[15]
; SSE2-NEXT: paddw %xmm10, %xmm8
; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3],xmm4[4],xmm9[4],xmm4[5],xmm9[5],xmm4[6],xmm9[6],xmm4[7],xmm9[7]
; SSE2-NEXT: paddw %xmm4, %xmm0
; SSE2-NEXT: movdqa %xmm5, %xmm4
; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm9[8],xmm4[9],xmm9[9],xmm4[10],xmm9[10],xmm4[11],xmm9[11],xmm4[12],xmm9[12],xmm4[13],xmm9[13],xmm4[14],xmm9[14],xmm4[15],xmm9[15]
; SSE2-NEXT: paddw %xmm11, %xmm4
; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7]
; SSE2-NEXT: paddw %xmm5, %xmm1
; SSE2-NEXT: movdqa %xmm6, %xmm5
; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15]
; SSE2-NEXT: paddw %xmm12, %xmm5
; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7]
; SSE2-NEXT: paddw %xmm6, %xmm2
; SSE2-NEXT: movdqa %xmm7, %xmm6
; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm9[8],xmm6[9],xmm9[9],xmm6[10],xmm9[10],xmm6[11],xmm9[11],xmm6[12],xmm9[12],xmm6[13],xmm9[13],xmm6[14],xmm9[14],xmm6[15],xmm9[15]
; SSE2-NEXT: paddw %xmm13, %xmm6
; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7]
; SSE2-NEXT: paddw %xmm7, %xmm3
; SSE2-NEXT: pcmpeqd %xmm7, %xmm7
; SSE2-NEXT: psubw %xmm7, %xmm8
; SSE2-NEXT: psubw %xmm7, %xmm0
; SSE2-NEXT: psubw %xmm7, %xmm4
; SSE2-NEXT: psubw %xmm7, %xmm1
; SSE2-NEXT: psubw %xmm7, %xmm5
; SSE2-NEXT: psubw %xmm7, %xmm2
; SSE2-NEXT: psubw %xmm7, %xmm6
; SSE2-NEXT: psubw %xmm7, %xmm3
; SSE2-NEXT: psrlw $1, %xmm3
; SSE2-NEXT: psrlw $1, %xmm6
; SSE2-NEXT: psrlw $1, %xmm2
; SSE2-NEXT: psrlw $1, %xmm5
; SSE2-NEXT: psrlw $1, %xmm1
; SSE2-NEXT: psrlw $1, %xmm4
; SSE2-NEXT: psrlw $1, %xmm0
; SSE2-NEXT: psrlw $1, %xmm8
; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT: pand %xmm7, %xmm8
; SSE2-NEXT: pand %xmm7, %xmm0
; SSE2-NEXT: packuswb %xmm8, %xmm0
; SSE2-NEXT: pand %xmm7, %xmm4
; SSE2-NEXT: pand %xmm7, %xmm1
; SSE2-NEXT: packuswb %xmm4, %xmm1
; SSE2-NEXT: pand %xmm7, %xmm5
; SSE2-NEXT: pand %xmm7, %xmm2
; SSE2-NEXT: packuswb %xmm5, %xmm2
; SSE2-NEXT: pand %xmm7, %xmm6
; SSE2-NEXT: pand %xmm7, %xmm3
; SSE2-NEXT: packuswb %xmm6, %xmm3
; SSE2-NEXT: retq
;
; AVX1-LABEL: avg_v64i8_3:
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm5 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm11 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm10 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpaddw %xmm7, %xmm5, %xmm12
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX1-NEXT: vpaddw %xmm1, %xmm4, %xmm13
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpaddw %xmm4, %xmm6, %xmm14
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpaddw %xmm2, %xmm0, %xmm15
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm6 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpaddw %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
; AVX1-NEXT: vpaddw %xmm2, %xmm11, %xmm2
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpaddw %xmm7, %xmm9, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
; AVX1-NEXT: vpaddw %xmm3, %xmm10, %xmm3
; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
; AVX1-NEXT: vpsubw %xmm5, %xmm12, %xmm8
; AVX1-NEXT: vpsubw %xmm5, %xmm13, %xmm4
; AVX1-NEXT: vpsubw %xmm5, %xmm14, %xmm0
; AVX1-NEXT: vpsubw %xmm5, %xmm15, %xmm1
; AVX1-NEXT: vpsubw %xmm5, %xmm6, %xmm6
; AVX1-NEXT: vpsubw %xmm5, %xmm2, %xmm2
; AVX1-NEXT: vpsubw %xmm5, %xmm7, %xmm7
; AVX1-NEXT: vpsubw %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vpsrlw $1, %xmm3, %xmm9
; AVX1-NEXT: vpsrlw $1, %xmm7, %xmm5
; AVX1-NEXT: vpsrlw $1, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $1, %xmm6, %xmm6
; AVX1-NEXT: vpsrlw $1, %xmm1, %xmm1
; AVX1-NEXT: vpsrlw $1, %xmm0, %xmm0
; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm4
; AVX1-NEXT: vpsrlw $1, %xmm8, %xmm7
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm7, %xmm7
; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm7[0],xmm4[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm1
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm5, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm9, %xmm3
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: avg_v64i8_3:
; AVX2: # BB#0:
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm5
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero,xmm5[8],zero,xmm5[9],zero,xmm5[10],zero,xmm5[11],zero,xmm5[12],zero,xmm5[13],zero,xmm5[14],zero,xmm5[15],zero
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm6
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero,xmm6[8],zero,xmm6[9],zero,xmm6[10],zero,xmm6[11],zero,xmm6[12],zero,xmm6[13],zero,xmm6[14],zero,xmm6[15],zero
; AVX2-NEXT: vpaddw %ymm6, %ymm4, %ymm4
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT: vpaddw %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT: vpaddw %ymm2, %ymm5, %ymm2
; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero
; AVX2-NEXT: vpaddw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
; AVX2-NEXT: vpsubw %ymm3, %ymm4, %ymm4
; AVX2-NEXT: vpsubw %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vpsubw %ymm3, %ymm2, %ymm2
; AVX2-NEXT: vpsubw %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX2-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX2-NEXT: vpsrlw $1, %ymm0, %ymm0
; AVX2-NEXT: vpsrlw $1, %ymm4, %ymm3
; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX2-NEXT: retq
;
; AVX512F-LABEL: avg_v64i8_3:
; AVX512F: # BB#0:
; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm5
; AVX512F-NEXT: vextracti128 $1, %ymm3, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm4, %xmm4
; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm6
; AVX512F-NEXT: vpavgb %xmm6, %xmm5, %xmm5
; AVX512F-NEXT: vpavgb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0
; AVX512F-NEXT: vpavgb %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: avg_v64i8_3:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: retq
%za = zext <64 x i8> %a to <64 x i16>
%zb = zext <64 x i8> %b to <64 x i16>
%add = add nuw nsw <64 x i16> %za, %zb
%add1 = add nuw nsw <64 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <64 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%res = trunc <64 x i16> %lshr to <64 x i8>
ret <64 x i8> %res
}

View File

@ -259,7 +259,7 @@ define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind
ret <4 x i64> %res
}
define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) {
define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_avg_epu8:
; X32: # BB#0:
; X32-NEXT: vpavgb %ymm1, %ymm0, %ymm0
@ -271,13 +271,17 @@ define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; X64-NEXT: retq
%arg0 = bitcast <4 x i64> %a0 to <32 x i8>
%arg1 = bitcast <4 x i64> %a1 to <32 x i8>
%res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1)
%zext0 = zext <32 x i8> %arg0 to <32 x i16>
%zext1 = zext <32 x i8> %arg1 to <32 x i16>
%add = add <32 x i16> %zext0, %zext1
%add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%res = trunc <32 x i16> %lshr to <32 x i8>
%bc = bitcast <32 x i8> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) {
define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; X32-LABEL: test_mm256_avg_epu16:
; X32: # BB#0:
; X32-NEXT: vpavgw %ymm1, %ymm0, %ymm0
@ -289,11 +293,15 @@ define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; X64-NEXT: retq
%arg0 = bitcast <4 x i64> %a0 to <16 x i16>
%arg1 = bitcast <4 x i64> %a1 to <16 x i16>
%res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1)
%zext0 = zext <16 x i16> %arg0 to <16 x i32>
%zext1 = zext <16 x i16> %arg1 to <16 x i32>
%add = add <16 x i32> %zext0, %zext1
%add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%res = trunc <16 x i32> %lshr to <16 x i16>
%bc = bitcast <16 x i16> %res to <4 x i64>
ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; X32-LABEL: test_mm256_blend_epi16:

View File

@ -514,3 +514,23 @@ define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) {
}
declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) {
; CHECK-LABEL: mm256_avg_epu8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retl
%res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) {
; CHECK-LABEL: mm256_avg_epu16:
; CHECK: ## BB#0:
; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT: retl
%res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone

View File

@ -114,38 +114,6 @@ define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) {
; AVX2-LABEL: test_x86_avx2_pavg_b:
; AVX2: ## BB#0:
; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe0,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pavg_b:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
ret <32 x i8> %res
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pavg_w:
; AVX2: ## BB#0:
; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe3,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_pavg_w:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
ret <16 x i16> %res
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
; AVX2-LABEL: test_x86_avx2_pmadd_wd:
; AVX2: ## BB#0:
@ -1340,18 +1308,18 @@ define <4 x i32> @test_x86_avx2_psrav_d_const(<4 x i32> %a0, <4 x i32> %a1) {
; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
; AVX2-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
; AVX2-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
; AVX2-NEXT: vpsravd LCPI88_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vmovdqa LCPI90_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
; AVX512VL-NEXT: vmovdqa LCPI88_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
; AVX512VL-NEXT: ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
; AVX512VL-NEXT: vpsravd LCPI90_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI88_0, kind: FK_Data_4
; AVX512VL-NEXT: vpsravd LCPI88_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI88_1, kind: FK_Data_4
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> <i32 2, i32 9, i32 -12, i32 23>, <4 x i32> <i32 1, i32 18, i32 35, i32 52>)
ret <4 x i32> %res
@ -1377,18 +1345,18 @@ define <8 x i32> @test_x86_avx2_psrav_d_256_const(<8 x i32> %a0, <8 x i32> %a1)
; AVX2: ## BB#0:
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; AVX2-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
; AVX2-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
; AVX2-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
; AVX2-NEXT: vpsravd LCPI90_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
; AVX2-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
; AVX512VL: ## BB#0:
; AVX512VL-NEXT: vmovdqa LCPI92_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; AVX512VL-NEXT: vmovdqa LCPI90_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
; AVX512VL-NEXT: ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI92_0, kind: FK_Data_4
; AVX512VL-NEXT: vpsravd LCPI92_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI92_1, kind: FK_Data_4
; AVX512VL-NEXT: ## fixup A - offset: 4, value: LCPI90_0, kind: FK_Data_4
; AVX512VL-NEXT: vpsravd LCPI90_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
; AVX512VL-NEXT: ## fixup A - offset: 5, value: LCPI90_1, kind: FK_Data_4
; AVX512VL-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> <i32 2, i32 9, i32 -12, i32 23, i32 -26, i32 37, i32 -40, i32 51>, <8 x i32> <i32 1, i32 18, i32 35, i32 52, i32 69, i32 15, i32 32, i32 49>)
ret <8 x i32> %res

View File

@ -3610,3 +3610,54 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
}
declare i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16>, <32 x i16>, i32, i32) nounwind readnone
declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
define <64 x i8>@mm512_avg_epu8(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: mm512_avg_epu8:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpaddb %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: mm512_avg_epu8:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovq {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpaddb %zmm3, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@mm512_avg_epu16(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: mm512_avg_epu16:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm3
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: mm512_avg_epu16:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm3
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}

View File

@ -1073,56 +1073,6 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32
ret <32 x i16> %res2
}
declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
define <64 x i8>@test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_b_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovq %rdi, %k1
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpavgb %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_b_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1
; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpavgb %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddb %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3)
%res1 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
%res2 = add <64 x i8> %res, %res1
ret <64 x i8> %res2
}
declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
define <32 x i16>@test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
; AVX512BW-LABEL: test_int_x86_avx512_mask_pavg_w_512:
; AVX512BW: ## BB#0:
; AVX512BW-NEXT: kmovd %edi, %k1
; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512BW-NEXT: vpavgw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT: retq
;
; AVX512F-32-LABEL: test_int_x86_avx512_mask_pavg_w_512:
; AVX512F-32: # BB#0:
; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1}
; AVX512F-32-NEXT: vpavgw %zmm1, %zmm0, %zmm0
; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
; AVX512F-32-NEXT: retl
%res = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
%res1 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
%res2 = add <32 x i16> %res, %res1
ret <32 x i16> %res2
}
declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>)
define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) {

View File

@ -3445,3 +3445,67 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
}
declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
define <16 x i8>@mm_mask_avg_epu8(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: mm_mask_avg_epu8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
; CHECK-NEXT: vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
%res2 = add <16 x i8> %res, %res1
ret <16 x i8> %res2
}
declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <32 x i8>@mm256_mask_avg_epu8(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: mm256_mask_avg_epu8:
; CHECK: ## BB#0:
; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
; CHECK-NEXT: vpaddb %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
ret <32 x i8> %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <8 x i16>@mm_mask_avg_epu16(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: mm_mask_avg_epu16:
; CHECK: ## BB#0:
; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <16 x i16>@mm256_mask_avg_epu16(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: mm256_mask_avg_epu16:
; CHECK: ## BB#0:
; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9]
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)

View File

@ -1900,70 +1900,6 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16
ret <16 x i16> %res2
}
declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
; CHECK-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
; CHECK-NEXT: vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
%res1 = call <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
%res2 = add <16 x i8> %res, %res1
ret <16 x i8> %res2
}
declare <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
define <32 x i8>@test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1]
; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1]
; CHECK-NEXT: vpaddb %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3)
%res1 = call <32 x i8> @llvm.x86.avx512.mask.pavg.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
%res2 = add <32 x i8> %res, %res1
ret <32 x i8> %res2
}
declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_128:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
; CHECK-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
; CHECK-NEXT: vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
%res1 = call <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
%res2 = add <8 x i16> %res, %res1
ret <8 x i16> %res2
}
declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_256:
; CHECK: ## BB#0:
; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
; CHECK-NEXT: retq ## encoding: [0xc3]
%res = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
%res1 = call <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
%res2 = add <16 x i16> %res, %res1
ret <16 x i16> %res2
}
declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {

View File

@ -252,11 +252,15 @@ define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <16 x i8>
%arg1 = bitcast <2 x i64> %a1 to <16 x i8>
%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1)
%zext0 = zext <16 x i8> %arg0 to <16 x i16>
%zext1 = zext <16 x i8> %arg1 to <16 x i16>
%add = add <16 x i16> %zext0, %zext1
%add1 = add <16 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%lshr = lshr <16 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%res = trunc <16 x i16> %lshr to <16 x i8>
%bc = bitcast <16 x i8> %res to <2 x i64>
ret <2 x i64> %bc
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X32-LABEL: test_mm_avg_epu16:
@ -270,11 +274,15 @@ define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; X64-NEXT: retq
%arg0 = bitcast <2 x i64> %a0 to <8 x i16>
%arg1 = bitcast <2 x i64> %a1 to <8 x i16>
%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %arg0, <8 x i16> %arg1)
%zext0 = zext <8 x i16> %arg0 to <8 x i32>
%zext1 = zext <8 x i16> %arg1 to <8 x i32>
%add = add <8 x i32> %zext0, %zext1
%add1 = add <8 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%lshr = lshr <8 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%res = trunc <8 x i32> %lshr to <8 x i16>
%bc = bitcast <8 x i16> %res to <2 x i64>
ret <2 x i64> %bc
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind {
; X32-LABEL: test_mm_bslli_si128:

View File

@ -282,5 +282,24 @@ define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) {
}
declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone
define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK-LABEL: mm_avg_epu8:
; CHECK: ## BB#0:
; CHECK-NEXT: pavgb %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: mm_avg_epu16:
; CHECK: ## BB#0:
; CHECK-NEXT: pavgw %xmm1, %xmm0
; CHECK-NEXT: retl
%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone

View File

@ -809,48 +809,6 @@ define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: test_x86_sse2_pavg_b:
; SSE: ## BB#0:
; SSE-NEXT: pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pavg_b:
; AVX2: ## BB#0:
; AVX2-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pavg_b:
; SKX: ## BB#0:
; SKX-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_pavg_w:
; SSE: ## BB#0:
; SSE-NEXT: pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1]
; SSE-NEXT: retl ## encoding: [0xc3]
;
; AVX2-LABEL: test_x86_sse2_pavg_w:
; AVX2: ## BB#0:
; AVX2-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1]
; AVX2-NEXT: retl ## encoding: [0xc3]
;
; SKX-LABEL: test_x86_sse2_pavg_w:
; SKX: ## BB#0:
; SKX-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
; SKX-NEXT: retl ## encoding: [0xc3]
%res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test_x86_sse2_pmadd_wd:
; SSE: ## BB#0:

View File

@ -3978,12 +3978,21 @@ define <16 x i8> @test_pavgb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; ZNVER1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpavgb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %1, <16 x i8> %2)
ret <16 x i8> %3
%1 = zext <16 x i8> %a0 to <16 x i16>
%2 = zext <16 x i8> %a1 to <16 x i16>
%3 = add <16 x i16> %1, %2
%4 = add <16 x i16> %3, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%5 = lshr <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = trunc <16 x i16> %5 to <16 x i8>
%7 = load <16 x i8>, <16 x i8> *%a2, align 16
%8 = zext <16 x i8> %6 to <16 x i16>
%9 = zext <16 x i8> %7 to <16 x i16>
%10 = add <16 x i16> %8, %9
%11 = add <16 x i16> %10, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%12 = lshr <16 x i16> %11, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%13 = trunc <16 x i16> %12 to <16 x i8>
ret <16 x i8> %13
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone
define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; GENERIC-LABEL: test_pavgw:
@ -4037,12 +4046,21 @@ define <8 x i16> @test_pavgw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; ZNVER1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpavgw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %1, <8 x i16> %2)
ret <8 x i16> %3
%1 = zext <8 x i16> %a0 to <8 x i32>
%2 = zext <8 x i16> %a1 to <8 x i32>
%3 = add <8 x i32> %1, %2
%4 = add <8 x i32> %3, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%5 = lshr <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = trunc <8 x i32> %5 to <8 x i16>
%7 = load <8 x i16>, <8 x i16> *%a2, align 16
%8 = zext <8 x i16> %6 to <8 x i32>
%9 = zext <8 x i16> %7 to <8 x i32>
%10 = add <8 x i32> %8, %9
%11 = add <8 x i32> %10, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%12 = lshr <8 x i32> %11, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%13 = trunc <8 x i32> %12 to <8 x i16>
ret <8 x i16> %13
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @test_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; GENERIC-LABEL: test_pcmpeqb:

View File

@ -275,19 +275,27 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
%2 = zext <16 x i8> %a0 to <16 x i16>
%3 = zext <16 x i8> %a1 to <16 x i16>
%4 = add <16 x i16> %2, %3
%5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = trunc <16 x i16> %6 to <16 x i8>
ret <16 x i8> %7
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
%2 = zext <8 x i16> %a0 to <8 x i32>
%3 = zext <8 x i16> %a1 to <8 x i32>
%4 = add <8 x i32> %2, %3
%5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = trunc <8 x i32> %6 to <8 x i16>
ret <8 x i16> %7
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
;CHECK-LABEL: stack_fold_pblendvb

View File

@ -234,19 +234,27 @@ define <32 x i8> @stack_fold_pavgb(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
%2 = zext <32 x i8> %a0 to <32 x i16>
%3 = zext <32 x i8> %a1 to <32 x i16>
%4 = add <32 x i16> %2, %3
%5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = trunc <32 x i16> %6 to <32 x i8>
ret <32 x i8> %7
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <16 x i16> @stack_fold_pavgw(<16 x i16> %a0, <16 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
%2 = zext <16 x i16> %a0 to <16 x i32>
%3 = zext <16 x i16> %a1 to <16 x i32>
%4 = add <16 x i32> %2, %3
%5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = trunc <16 x i32> %6 to <16 x i16>
ret <16 x i16> %7
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i32> @stack_fold_pblendd(<4 x i32> %a0, <4 x i32> %a1) {
;CHECK-LABEL: stack_fold_pblendd

View File

@ -70,52 +70,88 @@ define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
ret <64 x i8> %2
%2 = zext <64 x i8> %a0 to <64 x i16>
%3 = zext <64 x i8> %a1 to <64 x i16>
%4 = add <64 x i16> %2, %3
%5 = add <64 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = lshr <64 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = trunc <64 x i16> %6 to <64 x i8>
ret <64 x i8> %7
}
declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
define <64 x i8> @stack_fold_pavgb_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
;CHECK-LABEL: stack_fold_pavgb_mask
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <64 x i8>, <64 x i8>* %passthru
%3 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %2, i64 %mask)
ret <64 x i8> %3
%3 = zext <64 x i8> %a0 to <64 x i16>
%4 = zext <64 x i8> %a1 to <64 x i16>
%5 = add <64 x i16> %3, %4
%6 = add <64 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = lshr <64 x i16> %6, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%8 = trunc <64 x i16> %7 to <64 x i8>
%9 = bitcast i64 %mask to <64 x i1>
%10 = select <64 x i1> %9, <64 x i8> %8, <64 x i8> %2
ret <64 x i8> %10
}
define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
;CHECK-LABEL: stack_fold_pavgb_maskz
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> zeroinitializer, i64 %mask)
ret <64 x i8> %2
%2 = zext <64 x i8> %a0 to <64 x i16>
%3 = zext <64 x i8> %a1 to <64 x i16>
%4 = add <64 x i16> %2, %3
%5 = add <64 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = lshr <64 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = trunc <64 x i16> %6 to <64 x i8>
%8 = bitcast i64 %mask to <64 x i1>
%9 = select <64 x i1> %8, <64 x i8> %7, <64 x i8> zeroinitializer
ret <64 x i8> %9
}
define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1)
ret <32 x i16> %2
%2 = zext <32 x i16> %a0 to <32 x i32>
%3 = zext <32 x i16> %a1 to <32 x i32>
%4 = add <32 x i32> %2, %3
%5 = add <32 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = lshr <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = trunc <32 x i32> %6 to <32 x i16>
ret <32 x i16> %7
}
declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readnone
define <32 x i16> @stack_fold_pavgw_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
;CHECK-LABEL: stack_fold_pavgw_mask
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load <32 x i16>, <32 x i16>* %passthru
%3 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> %2, i32 %mask)
ret <32 x i16> %3
%3 = zext <32 x i16> %a0 to <32 x i32>
%4 = zext <32 x i16> %a1 to <32 x i32>
%5 = add <32 x i32> %3, %4
%6 = add <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = lshr <32 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%8 = trunc <32 x i32> %7 to <32 x i16>
%9 = bitcast i32 %mask to <32 x i1>
%10 = select <32 x i1> %9, <32 x i16> %8, <32 x i16> %2
ret <32 x i16> %10
}
define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
;CHECK-LABEL: stack_fold_pavgw_maskz
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> zeroinitializer, i32 %mask)
ret <32 x i16> %2
%2 = zext <32 x i16> %a0 to <32 x i32>
%3 = zext <32 x i16> %a1 to <32 x i32>
%4 = add <32 x i32> %2, %3
%5 = add <32 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = lshr <32 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = trunc <32 x i32> %6 to <32 x i16>
%8 = bitcast i32 %mask to <32 x i1>
%9 = select <32 x i1> %8, <32 x i16> %7, <32 x i16> zeroinitializer
ret <32 x i16> %9
}
define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {

View File

@ -49,37 +49,53 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
%2 = zext <16 x i8> %a0 to <16 x i16>
%3 = zext <16 x i8> %a1 to <16 x i16>
%4 = add <16 x i16> %2, %3
%5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = trunc <16 x i16> %6 to <16 x i8>
ret <16 x i8> %7
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb_ymm
;CHECK: vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1)
ret <32 x i8> %2
%2 = zext <32 x i8> %a0 to <32 x i16>
%3 = zext <32 x i8> %a1 to <32 x i16>
%4 = add <32 x i16> %2, %3
%5 = add <32 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = lshr <32 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = trunc <32 x i16> %6 to <32 x i8>
ret <32 x i8> %7
}
declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
%2 = zext <8 x i16> %a0 to <8 x i32>
%3 = zext <8 x i16> %a1 to <8 x i32>
%4 = add <8 x i32> %2, %3
%5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = trunc <8 x i32> %6 to <8 x i16>
ret <8 x i16> %7
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw_ymm
;CHECK: vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1)
ret <16 x i16> %2
%2 = zext <16 x i16> %a0 to <16 x i32>
%3 = zext <16 x i16> %a1 to <16 x i32>
%4 = add <16 x i32> %2, %3
%5 = add <16 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = lshr <16 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = trunc <16 x i32> %6 to <16 x i16>
ret <16 x i16> %7
}
declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
define <4 x i32> @stack_fold_vpconflictd(<4 x i32> %a0) {
;CHECK-LABEL: stack_fold_vpconflictd

View File

@ -302,19 +302,27 @@ define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
;CHECK-LABEL: stack_fold_pavgb
;CHECK: pavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
ret <16 x i8> %2
%2 = zext <16 x i8> %a0 to <16 x i16>
%3 = zext <16 x i8> %a1 to <16 x i16>
%4 = add <16 x i16> %2, %3
%5 = add <16 x i16> %4, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%6 = lshr <16 x i16> %5, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
%7 = trunc <16 x i16> %6 to <16 x i8>
ret <16 x i8> %7
}
declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
;CHECK-LABEL: stack_fold_pavgw
;CHECK: pavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
ret <8 x i16> %2
%2 = zext <8 x i16> %a0 to <8 x i32>
%3 = zext <8 x i16> %a1 to <8 x i32>
%4 = add <8 x i32> %2, %3
%5 = add <8 x i32> %4, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%6 = lshr <8 x i32> %5, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%7 = trunc <8 x i32> %6 to <8 x i16>
ret <8 x i16> %7
}
declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
;CHECK-LABEL: stack_fold_pblendvb