mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 10:42:39 +01:00
AVX512: VMOVAPS/PD and VMOVUPS/PD (load) intrinsic implementation.
Differential Revision: http://reviews.llvm.org/D16042 llvm-svn: 257463
This commit is contained in:
parent
04fcf95002
commit
46e273fe48
@ -1890,25 +1890,69 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
|
||||
def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
|
||||
Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
|
||||
[IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_loadu_ps_512 : GCCBuiltin<"__builtin_ia32_loadups512_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
|
||||
[IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_loadu_pd_512 : GCCBuiltin<"__builtin_ia32_loadupd512_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_load_ps_512 : GCCBuiltin<"__builtin_ia32_loadaps512_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty], [llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty],
|
||||
[IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_load_pd_512 : GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty], [llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty],
|
||||
[IntrReadArgMem]>;
|
||||
|
||||
def int_x86_avx512_mask_move_ss : GCCBuiltin<"__builtin_ia32_movss_mask">,
|
||||
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_x86_avx512_mask_move_sd : GCCBuiltin<"__builtin_ia32_movsd_mask">,
|
||||
Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_x86_avx512_mask_loadu_ps_128 :
|
||||
GCCBuiltin<"__builtin_ia32_loadups128_mask">,
|
||||
Intrinsic<[llvm_v4f32_ty],
|
||||
[llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_loadu_ps_256 :
|
||||
GCCBuiltin<"__builtin_ia32_loadups256_mask">,
|
||||
Intrinsic<[llvm_v8f32_ty],
|
||||
[llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_loadu_ps_512 :
|
||||
GCCBuiltin<"__builtin_ia32_loadups512_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty],
|
||||
[llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadArgMem]>;
|
||||
|
||||
def int_x86_avx512_mask_loadu_pd_128 :
|
||||
GCCBuiltin<"__builtin_ia32_loadupd128_mask">,
|
||||
Intrinsic<[llvm_v2f64_ty],
|
||||
[llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_loadu_pd_256 :
|
||||
GCCBuiltin<"__builtin_ia32_loadupd256_mask">,
|
||||
Intrinsic<[llvm_v4f64_ty],
|
||||
[llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_loadu_pd_512 :
|
||||
GCCBuiltin<"__builtin_ia32_loadupd512_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty],
|
||||
[llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
|
||||
def int_x86_avx512_mask_load_ps_128 :
|
||||
GCCBuiltin<"__builtin_ia32_loadaps128_mask">,
|
||||
Intrinsic<[llvm_v4f32_ty],
|
||||
[llvm_ptr_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_load_ps_256 :
|
||||
GCCBuiltin<"__builtin_ia32_loadaps256_mask">,
|
||||
Intrinsic<[llvm_v8f32_ty],
|
||||
[llvm_ptr_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_load_ps_512 :
|
||||
GCCBuiltin<"__builtin_ia32_loadaps512_mask">,
|
||||
Intrinsic<[llvm_v16f32_ty],
|
||||
[llvm_ptr_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrReadArgMem]>;
|
||||
|
||||
def int_x86_avx512_mask_load_pd_128 :
|
||||
GCCBuiltin<"__builtin_ia32_loadapd128_mask">,
|
||||
Intrinsic<[llvm_v2f64_ty],
|
||||
[llvm_ptr_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_load_pd_256 :
|
||||
GCCBuiltin<"__builtin_ia32_loadapd256_mask">,
|
||||
Intrinsic<[llvm_v4f64_ty],
|
||||
[llvm_ptr_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
def int_x86_avx512_mask_load_pd_512 :
|
||||
GCCBuiltin<"__builtin_ia32_loadapd512_mask">,
|
||||
Intrinsic<[llvm_v8f64_ty],
|
||||
[llvm_ptr_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrReadArgMem]>;
|
||||
|
||||
def int_x86_avx512_mask_move_ss :
|
||||
GCCBuiltin<"__builtin_ia32_movss_mask">,
|
||||
Intrinsic<[llvm_v4f32_ty],
|
||||
[llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
def int_x86_avx512_mask_move_sd :
|
||||
GCCBuiltin<"__builtin_ia32_movsd_mask">,
|
||||
Intrinsic<[llvm_v2f64_ty],
|
||||
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty],
|
||||
[IntrNoMem]>;
|
||||
}
|
||||
|
||||
// Conditional store ops
|
||||
|
@ -4171,6 +4171,35 @@ static bool hasFPCMov(unsigned X86CC) {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Describes the memory access performed by a target intrinsic call.
///
/// Looks \p Intrinsic up via getIntrinsicWithChain(); when the entry's type is
/// LOADA or LOADU, fills \p Info with a read-only, non-volatile access through
/// the call's first argument and returns true. Returns false when no entry is
/// found or the entry has any other type, leaving \p Info untouched.
///
/// \param Info      Output record populated only on the LOADA/LOADU path.
/// \param I         The intrinsic call; its result type and operand 0 are read.
/// \param Intrinsic Intrinsic ID used for the table lookup.
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
||||
const CallInst &I,
|
||||
unsigned Intrinsic) const {
|
||||
|
||||
// No table entry for this intrinsic: nothing to describe.
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
|
||||
if (!IntrData)
|
||||
return false;
|
||||
|
||||
switch (IntrData->Type) {
|
||||
// Aligned and unaligned loads share everything except the alignment.
case LOADA:
|
||||
case LOADU: {
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
// The memory type is taken from the call's result type.
Info.memVT = MVT::getVT(I.getType());
|
||||
// The address is the first call argument, with no extra offset.
Info.ptrVal = I.getArgOperand(0);
|
||||
Info.offset = 0;
|
||||
// LOADA: alignment is the memory type's size in bytes (bits / 8).
// LOADU: alignment is 1.
Info.align = (IntrData->Type == LOADA ? Info.memVT.getSizeInBits()/8 : 1);
|
||||
// Recorded as a non-volatile access that reads but does not write memory.
Info.vol = false;
|
||||
Info.readMem = true;
|
||||
Info.writeMem = false;
|
||||
return true;
|
||||
}
|
||||
// Every other intrinsic type falls through to the 'return false' below.
default:
|
||||
break;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Returns true if the target can instruction select the
|
||||
/// specified FP immediate natively. If false, the legalizer will
|
||||
/// materialize the FP immediate as a load from a constant pool.
|
||||
@ -17538,7 +17567,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
|
||||
return DAG.getMergeValues(Results, dl);
|
||||
}
|
||||
case COMPRESS_TO_MEM: {
|
||||
SDLoc dl(Op);
|
||||
SDValue Mask = Op.getOperand(4);
|
||||
SDValue DataToCompress = Op.getOperand(3);
|
||||
SDValue Addr = Op.getOperand(2);
|
||||
@ -17564,7 +17592,6 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
|
||||
case TRUNCATE_TO_MEM_VI32:
|
||||
return LowerINTRINSIC_TRUNCATE_TO_MEM(Op, DAG, MVT::i32);
|
||||
case EXPAND_FROM_MEM: {
|
||||
SDLoc dl(Op);
|
||||
SDValue Mask = Op.getOperand(4);
|
||||
SDValue PassThru = Op.getOperand(3);
|
||||
SDValue Addr = Op.getOperand(2);
|
||||
@ -17584,6 +17611,25 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
|
||||
Mask, PassThru, Subtarget, DAG), Chain};
|
||||
return DAG.getMergeValues(Results, dl);
|
||||
}
|
||||
case LOADU:
|
||||
case LOADA: {
|
||||
SDValue Mask = Op.getOperand(4);
|
||||
SDValue PassThru = Op.getOperand(3);
|
||||
SDValue Addr = Op.getOperand(2);
|
||||
SDValue Chain = Op.getOperand(0);
|
||||
MVT VT = Op.getSimpleValueType();
|
||||
|
||||
MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
|
||||
assert(MemIntr && "Expected MemIntrinsicSDNode!");
|
||||
|
||||
if (isAllOnesConstant(Mask)) // return just a load
|
||||
return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
|
||||
|
||||
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
|
||||
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
|
||||
return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
|
||||
MemIntr->getMemOperand(), ISD::NON_EXTLOAD);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -837,6 +837,13 @@ namespace llvm {
|
||||
/// from i32 to i8 but not from i32 to i16.
|
||||
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
|
||||
|
||||
/// Given an intrinsic, checks if on the target the intrinsic will need to map
|
||||
/// to a MemIntrinsicNode (touches memory). If this is the case, it returns
|
||||
/// true and stores the intrinsic information into the IntrinsicInfo that was
|
||||
/// passed to the function.
|
||||
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
|
||||
unsigned Intrinsic) const override;
|
||||
|
||||
/// Returns true if the target can instruction select the
|
||||
/// specified FP immediate natively. If false, the legalizer will
|
||||
/// materialize the FP immediate as a load from a constant pool.
|
||||
|
@ -2707,30 +2707,6 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, 0>,
|
||||
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>,
|
||||
PD, VEX_W, EVEX_CD8<64, CD8VF>;
|
||||
|
||||
def: Pat<(v8f64 (int_x86_avx512_mask_loadu_pd_512 addr:$ptr,
|
||||
(bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
|
||||
(VMOVUPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
|
||||
|
||||
def: Pat<(v16f32 (int_x86_avx512_mask_loadu_ps_512 addr:$ptr,
|
||||
(bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
|
||||
(VMOVUPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
|
||||
(bc_v8f64 (v16i32 immAllZerosV)), GR8:$mask)),
|
||||
(VMOVAPDZrmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
|
||||
|
||||
def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
|
||||
(bc_v16f32 (v16i32 immAllZerosV)), GR16:$mask)),
|
||||
(VMOVAPSZrmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
|
||||
|
||||
def: Pat<(v8f64 (int_x86_avx512_mask_load_pd_512 addr:$ptr,
|
||||
(bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))),
|
||||
(VMOVAPDZrm addr:$ptr)>;
|
||||
|
||||
def: Pat<(v16f32 (int_x86_avx512_mask_load_ps_512 addr:$ptr,
|
||||
(bc_v16f32 (v16i32 immAllZerosV)), (i16 -1))),
|
||||
(VMOVAPSZrm addr:$ptr)>;
|
||||
|
||||
def: Pat<(int_x86_avx512_mask_storeu_ps_512 addr:$ptr, (v16f32 VR512:$src),
|
||||
GR16:$mask),
|
||||
(VMOVUPSZmrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
|
||||
|
@ -29,7 +29,7 @@ enum IntrinsicType {
|
||||
INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
|
||||
COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC,
|
||||
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
|
||||
EXPAND_FROM_MEM, BLEND, INSERT_SUBVEC,
|
||||
EXPAND_FROM_MEM, LOADA, LOADU, BLEND, INSERT_SUBVEC,
|
||||
TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
|
||||
};
|
||||
|
||||
@ -143,6 +143,18 @@ static const IntrinsicData IntrinsicsWithChain[] = {
|
||||
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
|
||||
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_load_pd_128, LOADA, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_load_pd_256, LOADA, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_load_pd_512, LOADA, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_load_ps_128, LOADA, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_load_ps_256, LOADA, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_load_ps_512, LOADA, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_128, LOADU, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_256, LOADU, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_loadu_pd_512, LOADU, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_128, LOADU, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_256, LOADU, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_loadu_ps_512, LOADU, ISD::DELETED_NODE, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
|
||||
X86ISD::VTRUNC, 0),
|
||||
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
|
||||
|
@ -907,49 +907,79 @@ define void @test_mask_store_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask)
|
||||
|
||||
declare void @llvm.x86.avx512.mask.store.pd.512(i8*, <8 x double>, i8)
|
||||
|
||||
define <16 x float> @test_maskz_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
|
||||
; CHECK-LABEL: test_maskz_load_aligned_ps:
|
||||
define <16 x float> @test_mask_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_aligned_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vmovaps (%rdi), %zmm0
|
||||
; CHECK-NEXT: vmovaps (%rdi), %zmm0 {%k1}
|
||||
; CHECK-NEXT: vmovaps (%rdi), %zmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
|
||||
ret <16 x float> %res
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
|
||||
%res1 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
|
||||
%res2 = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
|
||||
%res4 = fadd <16 x float> %res2, %res1
|
||||
ret <16 x float> %res4
|
||||
}
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8*, <16 x float>, i16)
|
||||
|
||||
define <8 x double> @test_maskz_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_maskz_load_aligned_pd:
|
||||
define <16 x float> @test_mask_load_unaligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_unaligned_ps:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: kmovw %esi, %k1
|
||||
; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z}
|
||||
; CHECK-NEXT: vmovups (%rdi), %zmm0
|
||||
; CHECK-NEXT: vmovups (%rdi), %zmm0 {%k1}
|
||||
; CHECK-NEXT: vmovups (%rdi), %zmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
|
||||
ret <8 x double> %res
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
|
||||
%res1 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> %res, i16 %mask)
|
||||
%res2 = call <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 %mask)
|
||||
%res4 = fadd <16 x float> %res2, %res1
|
||||
ret <16 x float> %res4
|
||||
}
|
||||
|
||||
declare <16 x float> @llvm.x86.avx512.mask.loadu.ps.512(i8*, <16 x float>, i16)
|
||||
|
||||
define <8 x double> @test_mask_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_aligned_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovapd (%rdi), %zmm0
|
||||
; CHECK-NEXT: vmovapd (%rdi), %zmm0 {%k1}
|
||||
; CHECK-NEXT: vmovapd (%rdi), %zmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
|
||||
%res1 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
|
||||
%res2 = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <8 x double> %res2, %res1
|
||||
ret <8 x double> %res4
|
||||
}
|
||||
|
||||
declare <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8*, <8 x double>, i8)
|
||||
|
||||
define <16 x float> @test_load_aligned_ps(<16 x float> %data, i8* %ptr, i16 %mask) {
|
||||
; CHECK-LABEL: test_load_aligned_ps:
|
||||
|
||||
define <8 x double> @test_mask_load_unaligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_unaligned_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vmovaps (%rdi), %zmm0
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovupd (%rdi), %zmm0
|
||||
; CHECK-NEXT: vmovupd (%rdi), %zmm0 {%k1}
|
||||
; CHECK-NEXT: vmovupd (%rdi), %zmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <16 x float> @llvm.x86.avx512.mask.load.ps.512(i8* %ptr, <16 x float> zeroinitializer, i16 -1)
|
||||
ret <16 x float> %res
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
|
||||
%res1 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> %res, i8 %mask)
|
||||
%res2 = call <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <8 x double> %res2, %res1
|
||||
ret <8 x double> %res4
|
||||
}
|
||||
|
||||
define <8 x double> @test_load_aligned_pd(<8 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_load_aligned_pd:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: vmovapd (%rdi), %zmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x double> @llvm.x86.avx512.mask.load.pd.512(i8* %ptr, <8 x double> zeroinitializer, i8 -1)
|
||||
ret <8 x double> %res
|
||||
}
|
||||
|
||||
declare <8 x i64> @llvm.x86.avx512.movntdqa(i8*)
|
||||
declare <8 x double> @llvm.x86.avx512.mask.loadu.pd.512(i8*, <8 x double>, i8)
|
||||
|
||||
define <8 x i64> @test_valign_q(<8 x i64> %a, <8 x i64> %b) {
|
||||
; CHECK-LABEL: test_valign_q:
|
||||
|
@ -6388,6 +6388,158 @@ define <4 x i64>@test_int_x86_avx512_mask_psll_qi_256(<4 x i64> %x0, i8 %x1, <4
|
||||
ret <4 x i64> %res4
|
||||
}
|
||||
|
||||
define <8 x float> @test_mask_load_aligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_aligned_ps_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovaps (%rdi), %ymm0
|
||||
; CHECK-NEXT: vmovaps (%rdi), %ymm0 {%k1}
|
||||
; CHECK-NEXT: vmovaps (%rdi), %ymm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
|
||||
%res1 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
|
||||
%res2 = call <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <8 x float> %res2, %res1
|
||||
ret <8 x float> %res4
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.x86.avx512.mask.load.ps.256(i8*, <8 x float>, i8)
|
||||
|
||||
define <8 x float> @test_mask_load_unaligned_ps_256(<8 x float> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_unaligned_ps_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovups (%rdi), %ymm0
|
||||
; CHECK-NEXT: vmovups (%rdi), %ymm0 {%k1}
|
||||
; CHECK-NEXT: vmovups (%rdi), %ymm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 -1)
|
||||
%res1 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> %res, i8 %mask)
|
||||
%res2 = call <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8* %ptr, <8 x float> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <8 x float> %res2, %res1
|
||||
ret <8 x float> %res4
|
||||
}
|
||||
|
||||
declare <8 x float> @llvm.x86.avx512.mask.loadu.ps.256(i8*, <8 x float>, i8)
|
||||
|
||||
define <4 x double> @test_mask_load_aligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_aligned_pd_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovapd (%rdi), %ymm0
|
||||
; CHECK-NEXT: vmovapd (%rdi), %ymm0 {%k1}
|
||||
; CHECK-NEXT: vmovapd (%rdi), %ymm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
|
||||
%res1 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
|
||||
%res2 = call <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <4 x double> %res2, %res1
|
||||
ret <4 x double> %res4
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.avx512.mask.load.pd.256(i8*, <4 x double>, i8)
|
||||
|
||||
define <4 x double> @test_mask_load_unaligned_pd_256(<4 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_unaligned_pd_256:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovupd (%rdi), %ymm0
|
||||
; CHECK-NEXT: vmovupd (%rdi), %ymm0 {%k1}
|
||||
; CHECK-NEXT: vmovupd (%rdi), %ymm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 -1)
|
||||
%res1 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> %res, i8 %mask)
|
||||
%res2 = call <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8* %ptr, <4 x double> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <4 x double> %res2, %res1
|
||||
ret <4 x double> %res4
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.x86.avx512.mask.loadu.pd.256(i8*, <4 x double>, i8)
|
||||
|
||||
define <4 x float> @test_mask_load_aligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_aligned_ps_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovaps (%rdi), %xmm0
|
||||
; CHECK-NEXT: vmovaps (%rdi), %xmm0 {%k1}
|
||||
; CHECK-NEXT: vmovaps (%rdi), %xmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
|
||||
%res1 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
|
||||
%res2 = call <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <4 x float> %res2, %res1
|
||||
ret <4 x float> %res4
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.avx512.mask.load.ps.128(i8*, <4 x float>, i8)
|
||||
|
||||
define <4 x float> @test_mask_load_unaligned_ps_128(<4 x float> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_unaligned_ps_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovups (%rdi), %xmm0
|
||||
; CHECK-NEXT: vmovups (%rdi), %xmm0 {%k1}
|
||||
; CHECK-NEXT: vmovups (%rdi), %xmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 -1)
|
||||
%res1 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> %res, i8 %mask)
|
||||
%res2 = call <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8* %ptr, <4 x float> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <4 x float> %res2, %res1
|
||||
ret <4 x float> %res4
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.x86.avx512.mask.loadu.ps.128(i8*, <4 x float>, i8)
|
||||
|
||||
define <2 x double> @test_mask_load_aligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_aligned_pd_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovapd (%rdi), %xmm0
|
||||
; CHECK-NEXT: vmovapd (%rdi), %xmm0 {%k1}
|
||||
; CHECK-NEXT: vmovapd (%rdi), %xmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
|
||||
%res1 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
|
||||
%res2 = call <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <2 x double> %res2, %res1
|
||||
ret <2 x double> %res4
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.avx512.mask.load.pd.128(i8*, <2 x double>, i8)
|
||||
|
||||
define <2 x double> @test_mask_load_unaligned_pd_128(<2 x double> %data, i8* %ptr, i8 %mask) {
|
||||
; CHECK-LABEL: test_mask_load_unaligned_pd_128:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: movzbl %sil, %eax
|
||||
; CHECK-NEXT: kmovw %eax, %k1
|
||||
; CHECK-NEXT: vmovupd (%rdi), %xmm0
|
||||
; CHECK-NEXT: vmovupd (%rdi), %xmm0 {%k1}
|
||||
; CHECK-NEXT: vmovupd (%rdi), %xmm1 {%k1} {z}
|
||||
; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
|
||||
; CHECK-NEXT: retq
|
||||
%res = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 -1)
|
||||
%res1 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> %res, i8 %mask)
|
||||
%res2 = call <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8* %ptr, <2 x double> zeroinitializer, i8 %mask)
|
||||
%res4 = fadd <2 x double> %res2, %res1
|
||||
ret <2 x double> %res4
|
||||
}
|
||||
|
||||
declare <2 x double> @llvm.x86.avx512.mask.loadu.pd.128(i8*, <2 x double>, i8)
|
||||
|
||||
declare <4 x i32> @llvm.x86.avx512.mask.psrav4.si(<4 x i32>, <4 x i32>, <4 x i32>, i8)
|
||||
|
||||
define <4 x i32>@test_int_x86_avx512_mask_psrav4_si(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
|
||||
|
Loading…
Reference in New Issue
Block a user