From 1e3f8dcb8640193ece0d0004fef8fc16af83bafc Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 29 May 2018 05:22:05 +0000
Subject: [PATCH] [X86] Remove masked vpermi2var/vpermt2var intrinsics and
 autoupgrade.

We have unmasked intrinsics now and wrap them with a select. This is a
net reduction of 36 intrinsics from before the unmasked intrinsics were
added.

llvm-svn: 333388
---
 include/llvm/IR/IntrinsicsX86.td              | 270 -----------------
 lib/IR/AutoUpgrade.cpp                        |  64 ++++
 lib/Target/X86/X86ISelLowering.cpp            |  38 ---
 lib/Target/X86/X86IntrinsicsInfo.h            | 113 +------
 test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 159 ++++++++++
 test/CodeGen/X86/avx512-intrinsics.ll         | 107 ++++---
 .../X86/avx512bw-intrinsics-upgrade.ll        |  78 +++++
 test/CodeGen/X86/avx512bw-intrinsics.ll       |  78 ++---
 .../X86/avx512bwvl-intrinsics-upgrade.ll      | 102 +++++++
 test/CodeGen/X86/avx512bwvl-intrinsics.ll     |  72 ++---
 .../X86/avx512vbmi-intrinsics-upgrade.ll      |  56 ++++
 test/CodeGen/X86/avx512vbmi-intrinsics.ll     |  54 ++--
 .../X86/avx512vbmivl-intrinsics-upgrade.ll    | 112 +++++++
 test/CodeGen/X86/avx512vbmivl-intrinsics.ll   |  96 +++---
 .../X86/avx512vl-intrinsics-upgrade.ll        | 284 ++++++++++++++++++
 test/CodeGen/X86/avx512vl-intrinsics.ll       | 264 +++++++++++++---
 .../X86/vector-shuffle-combining-avx512bw.ll  |  28 +-
 .../vector-shuffle-combining-avx512bwvl.ll    |   4 +-
 .../vector-shuffle-combining-avx512vbmi.ll    |   4 +-
 19 files changed, 1315 insertions(+), 668 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 8e37afe93cb..6482f05b938 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1012,276 +1012,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
         GCCBuiltin<"__builtin_ia32_vpermilvarps256">,
         Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_vpermi2var_d_128 : // FIXME: Remove
-        Intrinsic<[llvm_v4i32_ty],
-                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_d_256 : // FIXME: Remove
-        Intrinsic<[llvm_v8i32_ty],
-                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_d_512 : // FIXME: Remove
-        Intrinsic<[llvm_v16i32_ty],
-                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_hi_128 : // FIXME: Remove
-        Intrinsic<[llvm_v8i16_ty],
-                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_hi_256 : // FIXME: Remove
-        Intrinsic<[llvm_v16i16_ty],
-                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_hi_512 : // FIXME: Remove
-        Intrinsic<[llvm_v32i16_ty],
-                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_pd_128 : // FIXME: Remove
-        Intrinsic<[llvm_v2f64_ty],
-                  [llvm_v2f64_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_pd_256 : // FIXME: Remove
-        Intrinsic<[llvm_v4f64_ty],
-                  [llvm_v4f64_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_pd_512 : // FIXME: Remove
-        Intrinsic<[llvm_v8f64_ty],
-                  [llvm_v8f64_ty, llvm_v8i64_ty, llvm_v8f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_ps_128 : // FIXME: Remove
-        Intrinsic<[llvm_v4f32_ty],
-                  [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_ps_256 : // FIXME: Remove
-        Intrinsic<[llvm_v8f32_ty],
-                  [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_ps_512 : // FIXME: Remove
-        Intrinsic<[llvm_v16f32_ty],
-                  [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16f32_ty, llvm_i16_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_q_128 : // FIXME: Remove
-        Intrinsic<[llvm_v2i64_ty],
-                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_q_256 : // FIXME: Remove
-        Intrinsic<[llvm_v4i64_ty],
-                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_q_512 : // FIXME: Remove
-        Intrinsic<[llvm_v8i64_ty],
-                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_d_512: // FIXME: Remove
-      Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty,
-                 llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_q_512: // FIXME: Remove
-      Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty,
-                 llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_ps_512: // FIXME: Remove
-      Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty,
-                 llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_pd_512: // FIXME: Remove
-      Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty,
-                 llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_d_128 : // FIXME: Remove
-        Intrinsic<[llvm_v4i32_ty],
-                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_d_128 : // FIXME: Remove
-        Intrinsic<[llvm_v4i32_ty],
-                  [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_d_256 : // FIXME: Remove
-        Intrinsic<[llvm_v8i32_ty],
-                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_d_256 : // FIXME: Remove
-        Intrinsic<[llvm_v8i32_ty],
-                  [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_d_512 : // FIXME: Remove
-        Intrinsic<[llvm_v16i32_ty],
-                  [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_hi_128 : // FIXME: Remove
-        Intrinsic<[llvm_v8i16_ty],
-                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_hi_128 : // FIXME: Remove
-        Intrinsic<[llvm_v8i16_ty],
-                  [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_hi_256 : // FIXME: Remove
-        Intrinsic<[llvm_v16i16_ty],
-                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_hi_256 : // FIXME: Remove
-        Intrinsic<[llvm_v16i16_ty],
-                  [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty, llvm_i16_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_hi_512 : // FIXME: Remove
-        Intrinsic<[llvm_v32i16_ty],
-                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_hi_512 : // FIXME: Remove
-        Intrinsic<[llvm_v32i16_ty],
-                  [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty, llvm_i32_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_pd_128 : // FIXME: Remove
-        Intrinsic<[llvm_v2f64_ty],
-                  [llvm_v2i64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_pd_128 : // FIXME: Remove
-        Intrinsic<[llvm_v2f64_ty],
-                  [llvm_v2i64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_pd_256 : // FIXME: Remove
-        Intrinsic<[llvm_v4f64_ty],
-                  [llvm_v4i64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_pd_256 : // FIXME: Remove
-        Intrinsic<[llvm_v4f64_ty],
-                  [llvm_v4i64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_pd_512 : // FIXME: Remove
-        Intrinsic<[llvm_v8f64_ty],
-                  [llvm_v8i64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_ps_128 : // FIXME: Remove
-        Intrinsic<[llvm_v4f32_ty],
-                  [llvm_v4i32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_ps_128 : // FIXME: Remove
-        Intrinsic<[llvm_v4f32_ty],
-                  [llvm_v4i32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_ps_256 : // FIXME: Remove
-        Intrinsic<[llvm_v8f32_ty],
-                  [llvm_v8i32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_ps_256 : // FIXME: Remove
-        Intrinsic<[llvm_v8f32_ty],
-                  [llvm_v8i32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_ps_512 : // FIXME: Remove
-        Intrinsic<[llvm_v16f32_ty],
-                  [llvm_v16i32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_q_128 : // FIXME: Remove
-        Intrinsic<[llvm_v2i64_ty],
-                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_q_128 : // FIXME: Remove
-        Intrinsic<[llvm_v2i64_ty],
-                  [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_q_256 : // FIXME: Remove
-        Intrinsic<[llvm_v4i64_ty],
-                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_q_256 : // FIXME: Remove
-        Intrinsic<[llvm_v4i64_ty],
-                  [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_q_512 : // FIXME: Remove
-        Intrinsic<[llvm_v8i64_ty],
-                  [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
-                  [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_qi_128 : // FIXME: Remove
-      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-                 llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_qi_128 : // FIXME: Remove
-      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-                 llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_qi_128 : // FIXME: Remove
-      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-                 llvm_v16i8_ty, llvm_v16i8_ty, llvm_i16_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_qi_256 : // FIXME: Remove
-      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                 llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_qi_256 : // FIXME: Remove
-      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                 llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_qi_256 : // FIXME: Remove
-      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                 llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermi2var_qi_512 : // FIXME: Remove
-      Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
-                 llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_mask_vpermt2var_qi_512 : // FIXME: Remove
-      Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
-                 llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
-                 [IntrNoMem]>;
-
-  def int_x86_avx512_maskz_vpermt2var_qi_512 : // FIXME: Remove
-      Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty,
-                 llvm_v64i8_ty, llvm_v64i8_ty, llvm_i64_ty],
-                 [IntrNoMem]>;
-
-
-
-
   def int_x86_avx512_vpermi2var_d_128 :
         GCCBuiltin<"__builtin_ia32_vpermi2vard128">,
         Intrinsic<[llvm_v4i32_ty],
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index bd4638f147a..2455d3bb3d9 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -267,6 +267,9 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.maskz.pternlog.") || // Added in 7.0
       Name.startswith("avx512.mask.vpmadd52") || // Added in 7.0
       Name.startswith("avx512.maskz.vpmadd52") || // Added in 7.0
+      Name.startswith("avx512.mask.vpermi2var.") || // Added in 7.0
+      Name.startswith("avx512.mask.vpermt2var.") || // Added in 7.0
+      Name.startswith("avx512.maskz.vpermt2var.") || // Added in 7.0
      Name == "sse.cvtsi2ss" || // Added in 7.0
      Name == "sse.cvtsi642ss" || // Added in 7.0
      Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -2599,6 +2602,67 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
     Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
+  } else if (IsX86 && (Name.startswith("avx512.mask.vpermi2var.") ||
+                       Name.startswith("avx512.mask.vpermt2var.") ||
+                       Name.startswith("avx512.maskz.vpermt2var."))) {
+    bool ZeroMask = Name[11] == 'z';
+    bool IndexForm = Name[17] == 'i';
+    unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+    unsigned EltWidth = CI->getType()->getScalarSizeInBits();
+    bool IsFloat = CI->getType()->isFPOrFPVectorTy();
+    Intrinsic::ID IID;
+    if (VecWidth == 128 && EltWidth == 32 && IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_ps_128;
+    else if (VecWidth == 128 && EltWidth == 32 && !IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_d_128;
+    else if (VecWidth == 128 && EltWidth == 64 && IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_pd_128;
+    else if (VecWidth == 128 && EltWidth == 64 && !IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_q_128;
+    else if (VecWidth == 256 && EltWidth == 32 && IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_ps_256;
+    else if (VecWidth == 256 && EltWidth == 32 && !IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_d_256;
+    else if (VecWidth == 256 && EltWidth == 64 && IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_pd_256;
+    else if (VecWidth == 256 && EltWidth == 64 && !IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_q_256;
+    else if (VecWidth == 512 && EltWidth == 32 && IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_ps_512;
+    else if (VecWidth == 512 && EltWidth == 32 && !IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_d_512;
+    else if (VecWidth == 512 && EltWidth == 64 && IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_pd_512;
+    else if (VecWidth == 512 && EltWidth == 64 && !IsFloat)
+      IID = Intrinsic::x86_avx512_vpermi2var_q_512;
+    else if (VecWidth == 128 && EltWidth == 16)
+      IID = Intrinsic::x86_avx512_vpermi2var_hi_128;
+    else if (VecWidth == 256 && EltWidth == 16)
+      IID = Intrinsic::x86_avx512_vpermi2var_hi_256;
+    else if (VecWidth == 512 && EltWidth == 16)
+      IID = Intrinsic::x86_avx512_vpermi2var_hi_512;
+    else if (VecWidth == 128 && EltWidth == 8)
+      IID = Intrinsic::x86_avx512_vpermi2var_qi_128;
+    else if (VecWidth == 256 && EltWidth == 8)
+      IID = Intrinsic::x86_avx512_vpermi2var_qi_256;
+    else if (VecWidth == 512 && EltWidth == 8)
+      IID = Intrinsic::x86_avx512_vpermi2var_qi_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
+
+    Value *Args[] = { CI->getArgOperand(0) , CI->getArgOperand(1),
+                      CI->getArgOperand(2) };
+
+    // If this isn't index form we need to swap operand 0 and 1.
+    if (!IndexForm)
+      std::swap(Args[0], Args[1]);
+
+    Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID),
+                             Args);
+    Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
+                               : Builder.CreateBitCast(CI->getArgOperand(1),
+                                                       CI->getType());
+    Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
   } else if (IsX86 && Name.startswith("avx512.mask.") &&
              upgradeAVX512MaskToSelect(Name, Builder, *CI, Rep)) {
     // Rep will be updated by the call in the condition.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 09bd7bf274a..15eeddf0faa 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -20515,44 +20515,6 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       // Swap Src1 and Src2 in the node creation
       return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
     }
-    case VPERM_3OP_MASKZ:
-    case VPERM_3OP_MASK:{
-      // Src2 is the PassThru
-      SDValue Src1 = Op.getOperand(1);
-      SDValue Src2 = Op.getOperand(2);
-      SDValue Src3 = Op.getOperand(3);
-      SDValue Mask = Op.getOperand(4);
-      MVT VT = Op.getSimpleValueType();
-
-      // set PassThru element
-      SDValue PassThru;
-      if (IntrData->Type == VPERM_3OP_MASKZ)
-        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
-      else
-        PassThru = DAG.getBitcast(VT, Src2);
-
-      // Swap Src1 and Src2 in the node creation
-      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
-                                              dl, Op.getValueType(),
-                                              Src2, Src1, Src3),
-                                  Mask, PassThru, Subtarget, DAG);
-    }
-    case VPERMI_3OP_MASK:{
-      // Src2 is the PassThru
-      SDValue Src1 = Op.getOperand(1);
-      SDValue Src2 = Op.getOperand(2);
-      SDValue Src3 = Op.getOperand(3);
-      SDValue Mask = Op.getOperand(4);
-      MVT VT = Op.getSimpleValueType();
-
-      // set PassThru element
-      SDValue PassThru = DAG.getBitcast(VT, Src2);
-
-      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
-                                              dl, Op.getValueType(),
-                                              Src1, Src2, Src3),
-                                  Mask, PassThru, Subtarget, DAG);
-    }
     case FMA_OP_MASK3:
     case FMA_OP_MASKZ:
     case FMA_OP_MASK: {
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 3a79af9a1f5..7b5df0fd325 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -30,8 +30,7 @@ enum IntrinsicType : uint16_t {
   INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_IMM8_MASK,
   FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
   FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
-  IFMA_OP,
-  VPERM_2OP, VPERMI_3OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
+  IFMA_OP, VPERM_2OP, INTR_TYPE_SCALAR_MASK,
   INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
   COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
@@ -1061,79 +1060,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_256, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_512, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_512, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_128, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_256, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_hi_512, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_128, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_256, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_pd_512, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_128, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_256, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_ps_512, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_128, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_256, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_q_512, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_128, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_256, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermi2var_qi_512, VPERMI_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_128, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_256, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_d_512, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_128, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_256, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_hi_512, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_128, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_256, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_pd_512, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_128, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_256, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_ps_512, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_128, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_256, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_q_512, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_128, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_256, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
-                     X86ISD::VPERMV3, 0),
-
   X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
   X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
@@ -1277,43 +1203,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_256, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_512, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_512, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_128, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_256, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_hi_512, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_128, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_256, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_pd_512, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_128, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_256, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_ps_512, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_128, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_256, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_q_512, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_128, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_256, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-  X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
-                     X86ISD::VPERMV3, 0),
-
   X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 1656d3785ca..513d6d56b6c 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -4299,3 +4299,162 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
   %res2 = add <8 x i64> %res, %res1
   ret <8 x i64> %res2
 }
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1}
+; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT: retq
+  %x2 = load <16 x i32>, <16 x i32>* %x2p
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovapd %zmm0, %zmm3
+; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vaddpd %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
+  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovaps %zmm0, %zmm3
+; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
+  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddq %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2
+; CHECK-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpermt2d %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+  %x2 = load <16 x i32>, <16 x i32>* %x2p
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
+
+declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
+
+define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %esi, %k1
+; CHECK-NEXT: vmovapd %zmm1, %zmm2
+; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z}
+; CHECK-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1
+; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: retq
+  %x2s = load double, double* %x2ptr
+  %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
+  %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
+  %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
+  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1)
+  %res2 = fadd <8 x double> %res, %res1
+  ret <8 x double> %res2
+}
+
+declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
+
+define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovaps %zmm1, %zmm3
+; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
+  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
+  %res2 = fadd <16 x float> %res, %res1
+  ret <16 x float> %res2
+}
+
+
+declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+
+define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT: vpaddq %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
+  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+  %res2 = add <8 x i64> %res, %res1
+  ret <8 x i64> %res2
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+
+define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
+; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpaddd %zmm3, %zmm1, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
+  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+  %res2 = add <16 x i32> %res, %res1
+  ret <16 x i32> %res2
+}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index aeeb5905ad8..52e67ed6e98 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1955,7 +1955,7 @@ define <2 x double> @test_x86_avx512__mm_cvt_roundu64_sd(<2 x double> %a, i64 %b
 }
 declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64, i32) nounwind readnone
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>)
 
 define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512:
@@ -1967,65 +1967,75 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
   %x2 = load <16 x i32>, <16 x i32>* %x2p
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4)
+  %res2 = add <16 x i32> %3, %4
   ret <16 x i32> %res2
 }
 
-declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8)
+declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>)
 
 define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovapd %zmm0, %zmm3
 ; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vaddpd %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
-  %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
-  %res2 = fadd <8 x double> %res, %res1
+  %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
+  %2 = bitcast <8 x i64> %x1 to <8 x double>
+  %3 = bitcast i8 %x3 to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2
+  %5 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2)
+  %6 = bitcast <8 x i64> %x1 to <8 x double>
+  %res2 = fadd <8 x double> %4, %5
   ret <8 x double> %res2
 }
 
-declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16)
+declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)
 
 define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm0, %zmm3
 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
-  %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
-  %res2 = fadd <16 x float> %res, %res1
+  %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
+  %2 = bitcast <16 x i32> %x1 to <16 x float>
+  %3 = bitcast i16 %x3 to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2
+  %5 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2)
+  %6 = bitcast <16 x i32> %x1 to <16 x float>
+  %res2 = fadd <16 x float> %4, %5
   ret <16 x float> %res2
 }
 
-declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>)
 
 define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3
 ; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vpaddq %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1
+  %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2)
+  %res2 = add <8 x i64> %3, %4
   ret <8 x i64> %res2
 }
 
-declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
 ; CHECK: ## %bb.0:
@@ -2036,14 +2046,14 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16
 ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm0
 ; CHECK-NEXT: retq
   %x2 = load <16 x i32>, <16 x i32>* %x2p
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
-  %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer
+  %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x1)
+  %res2 = add <16 x i32> %3, %4
   ret <16 x i32> %res2
 }
 
-declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8)
-
 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
 ; CHECK: ## %bb.0:
@@ -2056,61 +2066,62 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <
   %x2s = load double, double* %x2ptr
   %x2ins = insertelement <8 x double> undef, double %x2s, i32 0
   %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer
-  %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3)
-  %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1)
-  %res2 = fadd <8 x double> %res, %res1
+  %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer
+  %4 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x1)
+  %res2 = fadd <8 x double> %3, %4
   ret <8 x double> %res2
 }
 
-declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
-
 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovaps %zmm1, %zmm3
 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
 ; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
-  %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
-  %res2 = fadd <16 x float> %res, %res1
+  %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer
+  %4 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2)
+  %res2 = fadd <16 x float> %3, %4
   ret <16 x float> %res2
 }
-
-declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
 ; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
-  %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  %res2 = add <8 x i64> %res, %res1
+  %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer
+  %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2)
+  %res2 = add <8 x i64> %3, %4
   ret <8 x i64> %res2
 }
 
-declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-
 define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3
 ; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
+; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vpaddd %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT: retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
-  %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
-  %res2 = add <16 x i32> %res, %res1
+  %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
+  %2 = bitcast i16 %x3 to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1
+  %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2)
+  %res2 = add <16 x i32> %3, %4
   ret <16 x i32> %res2
 }
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 422e0ee9fa0..b64084909bd 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -2787,3 +2787,81 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
   %res4 = add <32 x i16> %res3, %res2
   ret <32 x i16> %res4
 }
+
+declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+  %res2 = add <32 x i16> %res, %res1
+  ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+  %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+  %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+  %res2 = add <32 x i16> %res, %res1
+  ret <32 x i16> %res2
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+
+define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
+; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
+; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm3
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm1, %zmm0
+; AVX512BW-NEXT: retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
+; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm1, %zmm3
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm1, %zmm0
+; AVX512F-32-NEXT: retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
+  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+  %res2 = add <32 x i16> %res, %res1
+  ret <32 x i16> %res2
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 337d72b69bf..12cd5ad3624 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -995,81 +995,83 @@ define <32 x i16> @test_mask_subs_epu16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr
 declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
 ; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
 ; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
 ; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1}
-; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
-  %res2 = add <32 x i16> %res, %res1
+  %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1
+  %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
+  %res2 = add <32 x i16> %3, %4
   ret <32 x i16> %res2
 }
 
-declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
-
 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
 ; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
 ; AVX512BW: ## %bb.0:
-; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512BW-NEXT: kmovd %edi, %k1
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512:
 ; AVX512F-32: # %bb.0:
-; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z}
-; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3
+; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
-  %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
-  %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
-  %res2 = add <32 x i16> %res, %res1
+  %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2)
+  %res2 = add <32 x i16> %3, %4
   ret <32 x i16> %res2
 }
 
-declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>)
 
 define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
 ; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
 ; AVX512BW: ## %bb.0:
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm3
 ; AVX512BW-NEXT: kmovd %edi, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm3, %zmm0
+; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512:
 ; AVX512F-32: # %bb.0:
+; AVX512F-32-NEXT: vmovdqa64 %zmm0, %zmm3
+; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm1, %zmm3
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1}
-; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm3, %zmm0
+; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
-  %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
-  %res2 = add <32 x i16> %res, %res1
+  %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
+  %2 = bitcast i32 %x3 to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1
+  %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2)
+  %res2 = add <32 x i16> %3, %4
   ret <32 x i16> %res2
 }
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 11a7cc306f5..1c517d05c68 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -4050,3 +4050,105 @@ define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x
   %res4 = add <16 x i16> %res3, %res2
   ret <16 x i16> %res4
 }
+
+declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca]
+; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca]
+; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca]
+; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca]
+; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
+; CHECK-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca]
+; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
+  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  %res2 = add <8 x i16> %res, %res1
+  ret <8 x i16> %res2
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
+; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca]
+; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
+  %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  %res2 = add <16 x i16> %res, %res1
+  ret <16 x i16> %res2
+}
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 02e69fea203..607fd45adc6 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -1798,105 +1798,109 @@ define <32 x i8> @test_mask_subs_epu8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b,
 declare <32 x i8> @llvm.x86.avx512.mask.psubus.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
 
-declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
 define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca]
 ; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
-  %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
-  %res2 = add <8 x i16> %res, %res1
+  %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
+  %2 = bitcast i8 %x3 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1
+  %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2)
+  %res2 = add <8 x i16> %3, %4
   ret <8 x i16> %res2
 }
 
-declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
-
 define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
 ; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca]
 ; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
-  %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
-
%res2 = add <8 x i16> %res, %res1 + %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) + %res2 = add <8 x i16> %3, %4 ret <8 x i16> %res2 } -declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] ; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] ; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 + %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1 + %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) + %res2 = add <16 x i16> %3, %4 ret <16 x i16> %res2 } -declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) - define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] ; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] ; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 + %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) + %res2 = add <16 x i16> %3, %4 ret <16 x i16> %res2 } -declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, 
<8 x i16>, <8 x i16>) define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] ; CHECK-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] ; CHECK-NEXT: vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 + %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1 + %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) + %res2 = add <8 x i16> %3, %4 ret <8 x i16> %res2 } -declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>) define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] ; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] ; CHECK-NEXT: vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 + %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1 + %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) + %res2 = add <16 x i16> %3, %4 ret <16 x i16> %res2 } diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll index 014b055dfc1..be7d8ddd0cb 100644 --- a/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll @@ -19,3 +19,59 @@ define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8 %res4 = add <64 x i8> %res3, %res2 ret <64 x i8> %res4 } + +declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) + +define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> 
%x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 +; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} +; CHECK-NEXT: vpaddb %zmm3, %zmm4, %zmm0 +; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) + %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + %res3 = add <64 x i8> %res, %res1 + %res4 = add <64 x i8> %res3, %res2 + ret <64 x i8> %res4 +} + +declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) + +define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} +; CHECK-NEXT: vpaddb %zmm3, %zmm4, %zmm0 +; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) + %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) + %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + %res3 = add <64 x i8> %res, %res1 + %res4 = add <64 x i8> %res3, %res2 + ret <64 x i8> %res4 +} + +declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) + +define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: retq + %res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) + ret <64 x i8> %res +} diff --git a/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/test/CodeGen/X86/avx512vbmi-intrinsics.ll index 26ac03fd98c..c10a110be49 100644 --- a/test/CodeGen/X86/avx512vbmi-intrinsics.ll +++ b/test/CodeGen/X86/avx512vbmi-intrinsics.ll @@ -45,58 +45,64 @@ define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 ret <64 x i8> %res4 } -declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovq %rdi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 +; CHECK-NEXT: kmovq %rdi, %k1 ; CHECK-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} ; 
CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vpermi2b %zmm2, %zmm0, %zmm4 {%k1} {z} ; CHECK-NEXT: vpaddb %zmm3, %zmm4, %zmm0 ; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq - %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 + %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1 + %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2) + %5 = bitcast i64 %x3 to <64 x i1> + %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer + %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) + %res3 = add <64 x i8> %3, %6 + %res4 = add <64 x i8> %res3, %7 ret <64 x i8> %res4 } -declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) - define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovq %rdi, %k1 ; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1} -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} -; CHECK-NEXT: vpaddb %zmm1, %zmm4, %zmm0 -; CHECK-NEXT: vpaddb %zmm0, %zmm3, %zmm0 -; CHECK-NEXT: retq - %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 +; CHECK-NEXT: vpaddb %zmm3, %zmm4, %zmm0 +; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq + %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1 + %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> zeroinitializer, <64 x i8> %x0, <64 x i8> %x2) + %5 = bitcast i64 %x3 to <64 x i1> + %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer + %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) + %res3 = add <64 x i8> %3, %6 + %res4 = add <64 x i8> %res3, %7 ret <64 x i8> %res4 } -declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) - define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovq %rdi, %k1 ; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 {%k1} {z} ; CHECK-NEXT: retq -
%res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - ret <64 x i8> %res + %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 } diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll index efdcad2d7a2..e3fbeea20a9 100644 --- a/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll @@ -40,3 +40,115 @@ define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8 %res4 = add <32 x i8> %res3, %res2 ret <32 x i8> %res4 } + +declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] +; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2] +; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] +; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + %res3 = add <16 x i8> %res, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] +; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2] +; CHECK-NEXT: vpaddb %ymm3, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] +; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) + %res1 
= call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + %res3 = add <32 x i8> %res, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] +; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] +; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] +; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) + %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3) + %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + %res3 = add <16 x i8> %res, %res1 + %res4 = add <16 x i8> %res3, %res2 + ret <16 x i8> %res4 +} + +declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] +; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] +; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] +; CHECK-NEXT: vpaddb %ymm3, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] +; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) + %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3) + %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + %res3 = add <32 x i8> %res, %res1 + %res4 = add <32 x i8> %res3, %res2 + ret <32 x i8> %res4 +} + +declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) + +define <16 x 
i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x75,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) + ret <16 x i8> %res +} + +declare <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x75,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) + ret <32 x i8> %res +} diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll index 6bedb5d9069..3a9cc77b6b8 100644 --- a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll @@ -89,114 +89,126 @@ define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 ret <32 x i8> %res4 } -declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>) define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] ; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2] ; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] ; CHECK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 + %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 + %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2) + %5 = bitcast i16 %x3 to <16 x i1> + 
%6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer + %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) + %res3 = add <16 x i8> %3, %6 + %res4 = add <16 x i8> %res3, %7 ret <16 x i8> %res4 } -declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>) define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] ; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vpermi2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x75,0xe2] ; CHECK-NEXT: vpaddb %ymm3, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] ; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 + %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 + %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2) + %5 = bitcast i32 %x3 to <32 x i1> + %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer + %7 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) + %res3 = add <32 x i8> %3, %6 + %res4 = add <32 x i8> %res3, %7 ret <32 x i8> %res4 } -declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] ; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] ; CHECK-NEXT: vpaddb %xmm3, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] ; CHECK-NEXT: vpaddb %xmm0, %xmm1, 
%xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 + %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 + %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> zeroinitializer, <16 x i8> %x0, <16 x i8> %x2) + %5 = bitcast i16 %x3 to <16 x i1> + %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer + %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) + %res3 = add <16 x i8> %3, %6 + %res4 = add <16 x i8> %res3, %7 ret <16 x i8> %res4 } -declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xda] -; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xca] +; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; CHECK-NEXT: vpaddb %ymm1, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc1] -; CHECK-NEXT: vpaddb %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfc,0xc0] +; CHECK-NEXT: vpaddb %ymm3, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] +; CHECK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 + %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 + %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> zeroinitializer, <32 x i8> %x0, <32 x i8> %x2) + %5 = bitcast i32 %x3 to <32 x i1> + %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer + %7 = call <32 x i8> 
@llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) + %res3 = add <32 x i8> %3, %6 + %res4 = add <32 x i8> %res3, %7 ret <32 x i8> %res4 } -declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) - define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x75,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - ret <16 x i8> %res + %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 } -declare <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - define <32 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_256: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x75,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - ret <32 x i8> %res + %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 } diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 17a319d4649..372b031ec9c 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -6985,3 +6985,287 @@ define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x f %res2 = fadd <8 x float> %res, %res1 ret <8 x float> %res2 } + +declare <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 
x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) + +define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) + %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + %res2 = add <4 x i32> %res, %res1 + ret <4 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] +; CHECK-NEXT: 
kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + %res2 = add <8 x i32> %res, %res1 + ret <8 x i32> %res2 +} + +declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) + +define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] +; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] +; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) + %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) + %res2 = fadd <2 x double> %res, %res1 + ret <2 x double> %res2 +} + +declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) + +define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] +; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] +; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; 
CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) + %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + %res2 = fadd <4 x double> %res, %res1 + ret <4 x double> %res2 +} + +declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] +; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] +; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) + %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) + %res2 = fadd <4 x float> %res, %res1 + ret <4 x float> %res2 +} + +define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128_cast: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %x1cast = bitcast <2 x i64> %x1 to <4 x i32> + %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1cast, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] +; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] +; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) + %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + %res2 = fadd <8 x float> %res, %res1 + ret <8 x float> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: 
test_int_x86_avx512_mask_vpermi2var_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] +; CHECK-NEXT: vpaddq %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] +; CHECK-NEXT: vpaddq %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) + %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] +; CHECK-NEXT: vpaddq %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) + %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + %res2 = add <2 x i64> %res, %res1 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: 
vpermi2q %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] +; CHECK-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] +; CHECK-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) + %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] +; CHECK-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) + %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + %res2 = add <4 x i64> %res, %res1 + ret <4 x i64> %res2 +} diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index b6531a907a4..cf81ecc92e3 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -528,122 +528,178 @@ define <8 x float> @test_getexp_ps_256(<8 x float> %a0) { } declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone -declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; 
CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] +; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1 + %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res2 = add <4 x i32> %3, %4 + ret <4 x i32> %res2 +} define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] ; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1 + %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) + %res2 = add <4 x i32> %3, %4 ret <4 x i32> %res2 } -declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) - define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] ; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + %3 = select <4 x
i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer + %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) + %res2 = add <4 x i32> %3, %4 ret <4 x i32> %res2 } -declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +declare <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] +; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 + %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res2 = add <8 x i32> %3, %4 + ret <8 x i32> %res2 +} define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] ; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 + %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + %res2 = add <8 x i32> %3, %4 ret <8 x i32> %res2 } -declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) - define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] ; CHECK-NEXT: vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0xfe,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer + %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + %res2 = add <8 x i32> %3, %4 ret <8 x i32> %res2 } -declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) +declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>) define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] ; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] ; CHECK-NEXT: vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) - %res2 = fadd <2 x double> %res, %res1 + %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) + %2 = bitcast <2 x i64> %x1 to <2 x double> + %3 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> + %4 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %2 + %5 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) + %6 = bitcast <2 x i64> %x1 to <2 x double> + %res2 = fadd <2 x double> %4, %5 ret <2 x double> %res2 } -declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) +declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>) define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] ; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] ; CHECK-NEXT: vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) - %res1 = 
call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - %res2 = fadd <4 x double> %res, %res1 + %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) + %2 = bitcast <4 x i64> %x1 to <4 x double> + %3 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> + %4 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %2 + %5 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) + %6 = bitcast <4 x i64> %x1 to <4 x double> + %res2 = fadd <4 x double> %4, %5 ret <4 x double> %res2 } -declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>) define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] ; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] ; CHECK-NEXT: vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) - %res2 = fadd <4 x float> %res, %res1 + %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) + %2 = bitcast <4 x i32> %x1 to <4 x float> + %3 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> + %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2 + %5 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) + %6 = bitcast <4 x i32> %x1 to <4 x float> + %res2 = fadd <4 x float> %4, %5 ret <4 x float> %res2 } @@ -655,27 +711,147 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> % ; CHECK-NEXT: vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %x1cast = bitcast <2 x i64> %x1 to <4 x i32> - %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1cast, <4 x float> %x2, i8 %x3) - ret <4 x float> %res + %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1cast, <4 x float> %x2) + %2 = bitcast <4 x i32> %x1cast to <4 x float> + %3 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> + %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2 + ret <4 x float> %4 } -declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) +declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>) define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; CHECK-LABEL: 
test_int_x86_avx512_mask_vpermi2var_ps_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] ; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] ; CHECK-NEXT: vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] ; CHECK-NEXT: retq ## encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - %res2 = fadd <8 x float> %res, %res1 + %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) + %2 = bitcast <8 x i32> %x1 to <8 x float> + %3 = bitcast i8 %x3 to <8 x i1> + %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2 + %5 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) + %6 = bitcast <8 x i32> %x1 to <8 x float> + %res2 = fadd <8 x float> %4, %5 ret <8 x float> %res2 } +declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>) + +define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] +; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] +; CHECK-NEXT: vpaddq %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 + %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + %res2 = add <2 x i64> %3, %4 + ret <2 x i64> %res2 +} + +define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] +; CHECK-NEXT: vpaddq %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 + %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x 
i64> %x0, <2 x i64> %x2) + %res2 = add <2 x i64> %3, %4 + ret <2 x i64> %res2 +} + +define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] +; CHECK-NEXT: vpaddq %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> + %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer + %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + %res2 = add <2 x i64> %3, %4 + ret <2 x i64> %res2 +} + +declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>) + +define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] +; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] +; CHECK-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 + %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + %res2 = add <4 x i64> %3, %4 + ret <4 x i64> %res2 +} + +define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] +; CHECK-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 + %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + %res2 = add <4 x i64> %3, %4 + ret <4 x i64> %res2 +} + +define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> 
%x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] +; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; CHECK-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] +; CHECK-NEXT: vpaddq %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + %2 = bitcast i8 %x3 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer + %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + %res2 = add <4 x i64> %3, %4 + ret <4 x i64> %res2 +} + declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) define <2 x double>@test_int_x86_avx512_mask_scalef_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index 245eff5fec3..64dd50639d3 100644 --- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -124,9 +124,9 @@ define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x dou define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) { ; X32-LABEL: combine_vpermt2var_8f64_identity_mask: ; X32: # %bb.0: +; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k1 -; X32-NEXT: vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X32-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z} ; X32-NEXT: vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] ; X32-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z} @@ -134,8 +134,8 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 ; ; X64-LABEL: combine_vpermt2var_8f64_identity_mask: ; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0] +; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z} ; X64-NEXT: vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8] ; X64-NEXT: vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z} @@ -205,9 +205,9 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) { ; X32-LABEL: combine_vpermt2var_8i64_identity_mask: ; X32: # %bb.0: +; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X32-NEXT: kmovd %eax, %k1 -; X32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0] ; X32-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z} ; X32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0] ; X32-NEXT: vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z} @@ -215,8 +215,8 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> ; ; X64-LABEL: combine_vpermt2var_8i64_identity_mask: ; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 ; X64-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
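For reference, the upgrade pattern these tests now spell out in IR: the autoupgrader replaces a call to a removed masked intrinsic with the unmasked intrinsic followed by a mask select, bitcasting the scalar mask to a vector of i1 and, for the sub-8-lane types, narrowing it with a shufflevector. A minimal sketch of the resulting IR (the function name is illustrative, not from the patch):

declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32> @sketch_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %m) {
  ; Unmasked permute; the index operand %x1 is the pass-through for the
  ; vpermi2var form, so inactive lanes keep the index value.
  %perm = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
  ; Reinterpret the scalar mask as lanes of i1 and keep the low 4 bits.
  %mask = bitcast i8 %m to <8 x i1>
  %lo4 = shufflevector <8 x i1> %mask, <8 x i1> %mask, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = select <4 x i1> %lo4, <4 x i32> %perm, <4 x i32> %x1
  ret <4 x i32> %res
}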
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 245eff5fec3..64dd50639d3 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -124,9 +124,9 @@ define <8 x double> @combine_vpermt2var_8f64_identity(<8 x double> %x0, <8 x dou
 define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8 x double> %x1, i8 %m) {
 ; X32-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X32:       # %bb.0:
+; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vmovapd {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X32-NEXT:    vmovapd {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
 ; X32-NEXT:    vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -134,8 +134,8 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; X64-NEXT:    vpermi2pd %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -205,9 +205,9 @@ define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1)
 define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x1, i8 %m) {
 ; X32-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X32:       # %bb.0:
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
 ; X32-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
 ; X32-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -215,8 +215,8 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; X64-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -241,8 +241,8 @@ define <16 x float> @combine_vpermt2var_16f32_identity(<16 x float> %x0, <16 x f
 define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X32-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X32-NEXT:    vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -250,8 +250,8 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X64-NEXT:    vpermi2ps %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -300,15 +300,15 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <
 define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <16 x float> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 0, i32 1, i32 0, i32 1, i32 4, i32 5, i32 4, i32 5, i32 8, i32 9, i32 8, i32 9, i32 12, i32 13, i32 12, i32 13>, <16 x float> %x0, <16 x float> %x1, i16 %m)
@@ -317,19 +317,19 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <1
 define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%p0, <16 x float> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovaps (%eax), %zmm2
 ; X32-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
 ; X32-NEXT:    vmovaps %zmm1, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1
 ; X64-NEXT:    vmovaps (%rdi), %zmm2
 ; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
+; X64-NEXT:    kmovd %esi, %k1
 ; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
 ; X64-NEXT:    vmovaps %zmm1, %zmm0
 ; X64-NEXT:    retq
@@ -519,8 +519,8 @@ define <16 x i32> @combine_vpermt2var_16i32_identity(<16 x i32> %x0, <16 x i32>
 define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x i32> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X32-NEXT:    vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -528,8 +528,8 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x
 ;
 ; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X64-NEXT:    vpermi2d %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -554,8 +554,8 @@ define <32 x i16> @combine_vpermt2var_32i16_identity(<32 x i16> %x0, <32 x i16>
 define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x i16> %x1, i32 %m) {
 ; X32-LABEL: combine_vpermt2var_32i16_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
 ; X32-NEXT:    vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
@@ -563,8 +563,8 @@ define <32 x i16> @combine_vpermt2var_32i16_identity_mask(<32 x i16> %x0, <32 x
 ;
 ; X64-LABEL: combine_vpermt2var_32i16_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [63,30,61,28,59,26,57,24,55,22,53,20,51,18,49,16,47,46,13,44,11,42,9,40,7,38,5,36,3,34,1,32]
 ; X64-NEXT:    vpermi2w %zmm2, %zmm2, %zmm0 {%k1} {z}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index 006b89fc06f..0d56a24c5ff 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -20,8 +20,8 @@ define <16 x i16> @combine_vpermt2var_16i16_identity(<16 x i16> %x0, <16 x i16>
 define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x i16> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
 ; X32-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X32-NEXT:    vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
@@ -29,8 +29,8 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x
 ;
 ; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
 ; X64-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X64-NEXT:    vpermi2w %ymm2, %ymm2, %ymm0 {%k1} {z}
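The combine tests above still call the removed maskz intrinsics and rely on the autoupgrader; the only codegen change is the reordered kmov/vmov sequences. A sketch of what such a call becomes after upgrade, for the 512-bit float case (function name illustrative; the vpermt2var form takes the index first, so it maps to vpermi2var with the first two operands swapped):

declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>)

define <16 x float> @sketch_maskz_vpermt2var_ps_512(<16 x i32> %idx, <16 x float> %a, <16 x float> %b, i16 %m) {
  ; vpermt2var(idx, a, b) becomes vpermi2var(a, idx, b).
  %perm = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %a, <16 x i32> %idx, <16 x float> %b)
  ; maskz variant: inactive lanes are zeroed rather than merged.
  %mask = bitcast i16 %m to <16 x i1>
  %res = select <16 x i1> %mask, <16 x float> %perm, <16 x float> zeroinitializer
  ret <16 x float> %res
}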
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 55289b7900e..a9054102460 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -33,8 +33,8 @@ define <16 x i8> @combine_vpermt2var_16i8_identity(<16 x i8> %x0, <16 x i8> %x1)
 define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8> %x1, i16 %m) {
 ; X32-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X32:       # %bb.0:
-; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
 ; X32-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
 ; X32-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X32-NEXT:    vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
@@ -42,8 +42,8 @@ define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8>
 ;
 ; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
 ; X64-NEXT:    vmovdqa {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
 ; X64-NEXT:    vpermi2b %xmm2, %xmm2, %xmm0 {%k1} {z}
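One subtlety visible in the avx512vl tests earlier in this patch: for the floating-point vpermi2var variants the pass-through operand is the integer index vector, so the upgrade bitcasts it to the result type before the select. A sketch for the 128-bit double case (function name illustrative, not from the patch):

declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>)

define <2 x double> @sketch_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %m) {
  %perm = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2)
  ; The index vector doubles as the merge source; reinterpret its bits
  ; as doubles so the select types line up.
  %pt = bitcast <2 x i64> %x1 to <2 x double>
  %mask = bitcast i8 %m to <8 x i1>
  %lo2 = shufflevector <8 x i1> %mask, <8 x i1> %mask, <2 x i32> <i32 0, i32 1>
  %res = select <2 x i1> %lo2, <2 x double> %perm, <2 x double> %pt
  ret <2 x double> %res
}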