mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[X86][SSE] Add general lowering of nontemporal vector loads
Currently the only way to use the (V)MOVNTDQA nontemporal vector load instructions is through the int_x86_sse41_movntdqa-style builtins. This patch adds support for lowering nontemporal loads from general IR, allowing us to remove the movntdqa builtins in a future patch. We currently still fold nontemporal loads into suitable instructions; we should probably look at removing this (and the folding of nontemporal stores as well), or at least make the target's folding implementation aware that it is dealing with a nontemporal memory transaction. There is also an issue that VMOVNTDQA only acts on 128-bit vectors on pre-AVX2 hardware, so currently a normal ymm load is still used on AVX1 targets. Differential Review: http://reviews.llvm.org/D20965 llvm-svn: 272010
This commit is contained in:
parent
e1c6a997f8
commit
67ca4cba96
@ -953,6 +953,18 @@ def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
|
|||||||
return St->getAlignment() < St->getMemoryVT().getStoreSize();
|
return St->getAlignment() < St->getMemoryVT().getStoreSize();
|
||||||
}]>;
|
}]>;
|
||||||
|
|
||||||
|
// nontemporal load fragments.
|
||||||
|
def nontemporalload : PatFrag<(ops node:$ptr),
|
||||||
|
(load node:$ptr), [{
|
||||||
|
return cast<LoadSDNode>(N)->isNonTemporal();
|
||||||
|
}]>;
|
||||||
|
|
||||||
|
def alignednontemporalload : PatFrag<(ops node:$ptr),
|
||||||
|
(nontemporalload node:$ptr), [{
|
||||||
|
LoadSDNode *Ld = cast<LoadSDNode>(N);
|
||||||
|
return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
|
||||||
|
}]>;
|
||||||
|
|
||||||
// setcc convenience fragments.
|
// setcc convenience fragments.
|
||||||
def setoeq : PatFrag<(ops node:$lhs, node:$rhs),
|
def setoeq : PatFrag<(ops node:$lhs, node:$rhs),
|
||||||
(setcc node:$lhs, node:$rhs, SETOEQ)>;
|
(setcc node:$lhs, node:$rhs, SETOEQ)>;
|
||||||
|
@ -3301,6 +3301,19 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in {
|
|||||||
(VMOVNTDQZmr addr:$dst, VR512:$src)>;
|
(VMOVNTDQZmr addr:$dst, VR512:$src)>;
|
||||||
def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst),
|
def : Pat<(alignednontemporalstore (v64i8 VR512:$src), addr:$dst),
|
||||||
(VMOVNTDQZmr addr:$dst, VR512:$src)>;
|
(VMOVNTDQZmr addr:$dst, VR512:$src)>;
|
||||||
|
|
||||||
|
def : Pat<(v8f64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZrm addr:$src)>;
|
||||||
|
def : Pat<(v16f32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZrm addr:$src)>;
|
||||||
|
def : Pat<(v8i64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZrm addr:$src)>;
|
||||||
|
def : Pat<(v16i32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZrm addr:$src)>;
|
||||||
|
def : Pat<(v32i16 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZrm addr:$src)>;
|
||||||
|
def : Pat<(v64i8 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZrm addr:$src)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
let Predicates = [HasVLX], AddedComplexity = 400 in {
|
let Predicates = [HasVLX], AddedComplexity = 400 in {
|
||||||
@ -3311,12 +3324,38 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
|
|||||||
def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst),
|
def : Pat<(alignednontemporalstore (v32i8 VR256X:$src), addr:$dst),
|
||||||
(VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
|
(VMOVNTDQZ256mr addr:$dst, VR256X:$src)>;
|
||||||
|
|
||||||
|
def : Pat<(v4f64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ256rm addr:$src)>;
|
||||||
|
def : Pat<(v8f32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ256rm addr:$src)>;
|
||||||
|
def : Pat<(v4i64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ256rm addr:$src)>;
|
||||||
|
def : Pat<(v8i32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ256rm addr:$src)>;
|
||||||
|
def : Pat<(v16i16 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ256rm addr:$src)>;
|
||||||
|
def : Pat<(v32i8 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ256rm addr:$src)>;
|
||||||
|
|
||||||
def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
|
def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
|
||||||
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
|
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
|
||||||
def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst),
|
def : Pat<(alignednontemporalstore (v8i16 VR128X:$src), addr:$dst),
|
||||||
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
|
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
|
||||||
def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst),
|
def : Pat<(alignednontemporalstore (v16i8 VR128X:$src), addr:$dst),
|
||||||
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
|
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
|
||||||
|
|
||||||
|
def : Pat<(v2f64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ128rm addr:$src)>;
|
||||||
|
def : Pat<(v4f32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ128rm addr:$src)>;
|
||||||
|
def : Pat<(v2i64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ128rm addr:$src)>;
|
||||||
|
def : Pat<(v4i32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ128rm addr:$src)>;
|
||||||
|
def : Pat<(v8i16 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ128rm addr:$src)>;
|
||||||
|
def : Pat<(v16i8 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAZ128rm addr:$src)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
@ -7245,6 +7245,7 @@ let Predicates = [UseSSE41] in {
|
|||||||
(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
|
(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let AddedComplexity = 400 in { // Prefer non-temporal versions
|
||||||
let SchedRW = [WriteLoad] in {
|
let SchedRW = [WriteLoad] in {
|
||||||
let Predicates = [HasAVX, NoVLX] in
|
let Predicates = [HasAVX, NoVLX] in
|
||||||
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
|
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
|
||||||
@ -7261,6 +7262,35 @@ def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
|
|||||||
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
|
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
|
||||||
} // SchedRW
|
} // SchedRW
|
||||||
|
|
||||||
|
let Predicates = [HasAVX2, NoVLX] in {
|
||||||
|
def : Pat<(v8f32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAYrm addr:$src)>;
|
||||||
|
def : Pat<(v4f64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAYrm addr:$src)>;
|
||||||
|
def : Pat<(v4i64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQAYrm addr:$src)>;
|
||||||
|
}
|
||||||
|
|
||||||
|
let Predicates = [HasAVX, NoVLX] in {
|
||||||
|
def : Pat<(v4f32 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQArm addr:$src)>;
|
||||||
|
def : Pat<(v2f64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQArm addr:$src)>;
|
||||||
|
def : Pat<(v2i64 (alignednontemporalload addr:$src)),
|
||||||
|
(VMOVNTDQArm addr:$src)>;
|
||||||
|
}
|
||||||
|
|
||||||
|
let Predicates = [UseSSE41] in {
|
||||||
|
def : Pat<(v4f32 (alignednontemporalload addr:$src)),
|
||||||
|
(MOVNTDQArm addr:$src)>;
|
||||||
|
def : Pat<(v2f64 (alignednontemporalload addr:$src)),
|
||||||
|
(MOVNTDQArm addr:$src)>;
|
||||||
|
def : Pat<(v2i64 (alignednontemporalload addr:$src)),
|
||||||
|
(MOVNTDQArm addr:$src)>;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // AddedComplexity
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// SSE4.2 - Compare Instructions
|
// SSE4.2 - Compare Instructions
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
@ -220,19 +220,29 @@ entry:
|
|||||||
;
|
;
|
||||||
|
|
||||||
define <4 x float> @test_load_nt4xfloat(<4 x float>* nocapture %ptr) {
|
define <4 x float> @test_load_nt4xfloat(<4 x float>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt4xfloat:
|
; SSE2-LABEL: test_load_nt4xfloat:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE4A-LABEL: test_load_nt4xfloat:
|
||||||
|
; SSE4A: # BB#0: # %entry
|
||||||
|
; SSE4A-NEXT: movaps (%rdi), %xmm0
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt4xfloat:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt4xfloat:
|
; AVX-LABEL: test_load_nt4xfloat:
|
||||||
; AVX: # BB#0: # %entry
|
; AVX: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt4xfloat:
|
; AVX512-LABEL: test_load_nt4xfloat:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <4 x float>, <4 x float>* %ptr, align 16, !nontemporal !1
|
%0 = load <4 x float>, <4 x float>* %ptr, align 16, !nontemporal !1
|
||||||
@ -240,19 +250,29 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <2 x double> @test_load_nt2xdouble(<2 x double>* nocapture %ptr) {
|
define <2 x double> @test_load_nt2xdouble(<2 x double>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt2xdouble:
|
; SSE2-LABEL: test_load_nt2xdouble:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movapd (%rdi), %xmm0
|
; SSE2-NEXT: movapd (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE4A-LABEL: test_load_nt2xdouble:
|
||||||
|
; SSE4A: # BB#0: # %entry
|
||||||
|
; SSE4A-NEXT: movapd (%rdi), %xmm0
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt2xdouble:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt2xdouble:
|
; AVX-LABEL: test_load_nt2xdouble:
|
||||||
; AVX: # BB#0: # %entry
|
; AVX: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovapd (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt2xdouble:
|
; AVX512-LABEL: test_load_nt2xdouble:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovapd (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <2 x double>, <2 x double>* %ptr, align 16, !nontemporal !1
|
%0 = load <2 x double>, <2 x double>* %ptr, align 16, !nontemporal !1
|
||||||
@ -262,17 +282,17 @@ entry:
|
|||||||
define <16 x i8> @test_load_nt16xi8(<16 x i8>* nocapture %ptr) {
|
define <16 x i8> @test_load_nt16xi8(<16 x i8>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt16xi8:
|
; SSE-LABEL: test_load_nt16xi8:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE: # BB#0: # %entry
|
||||||
; SSE-NEXT: movdqa (%rdi), %xmm0
|
; SSE-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt16xi8:
|
; AVX-LABEL: test_load_nt16xi8:
|
||||||
; AVX: # BB#0: # %entry
|
; AVX: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt16xi8:
|
; AVX512-LABEL: test_load_nt16xi8:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <16 x i8>, <16 x i8>* %ptr, align 16, !nontemporal !1
|
%0 = load <16 x i8>, <16 x i8>* %ptr, align 16, !nontemporal !1
|
||||||
@ -282,17 +302,17 @@ entry:
|
|||||||
define <8 x i16> @test_load_nt8xi16(<8 x i16>* nocapture %ptr) {
|
define <8 x i16> @test_load_nt8xi16(<8 x i16>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt8xi16:
|
; SSE-LABEL: test_load_nt8xi16:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE: # BB#0: # %entry
|
||||||
; SSE-NEXT: movdqa (%rdi), %xmm0
|
; SSE-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt8xi16:
|
; AVX-LABEL: test_load_nt8xi16:
|
||||||
; AVX: # BB#0: # %entry
|
; AVX: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt8xi16:
|
; AVX512-LABEL: test_load_nt8xi16:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <8 x i16>, <8 x i16>* %ptr, align 16, !nontemporal !1
|
%0 = load <8 x i16>, <8 x i16>* %ptr, align 16, !nontemporal !1
|
||||||
@ -302,17 +322,17 @@ entry:
|
|||||||
define <4 x i32> @test_load_nt4xi32(<4 x i32>* nocapture %ptr) {
|
define <4 x i32> @test_load_nt4xi32(<4 x i32>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt4xi32:
|
; SSE-LABEL: test_load_nt4xi32:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE: # BB#0: # %entry
|
||||||
; SSE-NEXT: movdqa (%rdi), %xmm0
|
; SSE-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt4xi32:
|
; AVX-LABEL: test_load_nt4xi32:
|
||||||
; AVX: # BB#0: # %entry
|
; AVX: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt4xi32:
|
; AVX512-LABEL: test_load_nt4xi32:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <4 x i32>, <4 x i32>* %ptr, align 16, !nontemporal !1
|
%0 = load <4 x i32>, <4 x i32>* %ptr, align 16, !nontemporal !1
|
||||||
@ -322,17 +342,17 @@ entry:
|
|||||||
define <2 x i64> @test_load_nt2xi64(<2 x i64>* nocapture %ptr) {
|
define <2 x i64> @test_load_nt2xi64(<2 x i64>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt2xi64:
|
; SSE-LABEL: test_load_nt2xi64:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE: # BB#0: # %entry
|
||||||
; SSE-NEXT: movdqa (%rdi), %xmm0
|
; SSE-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt2xi64:
|
; AVX-LABEL: test_load_nt2xi64:
|
||||||
; AVX: # BB#0: # %entry
|
; AVX: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt2xi64:
|
; AVX512-LABEL: test_load_nt2xi64:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <2 x i64>, <2 x i64>* %ptr, align 16, !nontemporal !1
|
%0 = load <2 x i64>, <2 x i64>* %ptr, align 16, !nontemporal !1
|
||||||
@ -480,20 +500,37 @@ entry:
|
|||||||
;
|
;
|
||||||
|
|
||||||
define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) {
|
define <8 x float> @test_load_nt8xfloat(<8 x float>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt8xfloat:
|
; SSE2-LABEL: test_load_nt8xfloat:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt8xfloat:
|
; SSE4A-LABEL: test_load_nt8xfloat:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE4A-NEXT: movaps (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt8xfloat:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt8xfloat:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt8xfloat:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt8xfloat:
|
; AVX512-LABEL: test_load_nt8xfloat:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovaps (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <8 x float>, <8 x float>* %ptr, align 32, !nontemporal !1
|
%0 = load <8 x float>, <8 x float>* %ptr, align 32, !nontemporal !1
|
||||||
@ -501,20 +538,37 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) {
|
define <4 x double> @test_load_nt4xdouble(<4 x double>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt4xdouble:
|
; SSE2-LABEL: test_load_nt4xdouble:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movapd (%rdi), %xmm0
|
; SSE2-NEXT: movapd (%rdi), %xmm0
|
||||||
; SSE-NEXT: movapd 16(%rdi), %xmm1
|
; SSE2-NEXT: movapd 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt4xdouble:
|
; SSE4A-LABEL: test_load_nt4xdouble:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovapd (%rdi), %ymm0
|
; SSE4A-NEXT: movapd (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movapd 16(%rdi), %xmm1
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt4xdouble:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt4xdouble:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovapd (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt4xdouble:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt4xdouble:
|
; AVX512-LABEL: test_load_nt4xdouble:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovapd (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <4 x double>, <4 x double>* %ptr, align 32, !nontemporal !1
|
%0 = load <4 x double>, <4 x double>* %ptr, align 32, !nontemporal !1
|
||||||
@ -522,20 +576,37 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) {
|
define <32 x i8> @test_load_nt32xi8(<32 x i8>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt32xi8:
|
; SSE2-LABEL: test_load_nt32xi8:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt32xi8:
|
; SSE4A-LABEL: test_load_nt32xi8:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %ymm0
|
; SSE4A-NEXT: movaps (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt32xi8:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt32xi8:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt32xi8:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt32xi8:
|
; AVX512-LABEL: test_load_nt32xi8:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <32 x i8>, <32 x i8>* %ptr, align 32, !nontemporal !1
|
%0 = load <32 x i8>, <32 x i8>* %ptr, align 32, !nontemporal !1
|
||||||
@ -543,20 +614,37 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) {
|
define <16 x i16> @test_load_nt16xi16(<16 x i16>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt16xi16:
|
; SSE2-LABEL: test_load_nt16xi16:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt16xi16:
|
; SSE4A-LABEL: test_load_nt16xi16:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %ymm0
|
; SSE4A-NEXT: movaps (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt16xi16:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt16xi16:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt16xi16:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt16xi16:
|
; AVX512-LABEL: test_load_nt16xi16:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <16 x i16>, <16 x i16>* %ptr, align 32, !nontemporal !1
|
%0 = load <16 x i16>, <16 x i16>* %ptr, align 32, !nontemporal !1
|
||||||
@ -564,20 +652,37 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) {
|
define <8 x i32> @test_load_nt8xi32(<8 x i32>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt8xi32:
|
; SSE2-LABEL: test_load_nt8xi32:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt8xi32:
|
; SSE4A-LABEL: test_load_nt8xi32:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %ymm0
|
; SSE4A-NEXT: movaps (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt8xi32:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt8xi32:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt8xi32:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt8xi32:
|
; AVX512-LABEL: test_load_nt8xi32:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <8 x i32>, <8 x i32>* %ptr, align 32, !nontemporal !1
|
%0 = load <8 x i32>, <8 x i32>* %ptr, align 32, !nontemporal !1
|
||||||
@ -585,20 +690,37 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) {
|
define <4 x i64> @test_load_nt4xi64(<4 x i64>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt4xi64:
|
; SSE2-LABEL: test_load_nt4xi64:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt4xi64:
|
; SSE4A-LABEL: test_load_nt4xi64:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovdqa (%rdi), %ymm0
|
; SSE4A-NEXT: movaps (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt4xi64:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt4xi64:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovdqa (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt4xi64:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt4xi64:
|
; AVX512-LABEL: test_load_nt4xi64:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovdqa (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <4 x i64>, <4 x i64>* %ptr, align 32, !nontemporal !1
|
%0 = load <4 x i64>, <4 x i64>* %ptr, align 32, !nontemporal !1
|
||||||
@ -776,23 +898,45 @@ entry:
|
|||||||
;
|
;
|
||||||
|
|
||||||
define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
|
define <16 x float> @test_load_nt16xfloat(<16 x float>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt16xfloat:
|
; SSE2-LABEL: test_load_nt16xfloat:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movaps 32(%rdi), %xmm2
|
; SSE2-NEXT: movaps 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movaps 48(%rdi), %xmm3
|
; SSE2-NEXT: movaps 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt16xfloat:
|
; SSE4A-LABEL: test_load_nt16xfloat:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE4A-NEXT: movaps (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
|
; SSE4A-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movaps 32(%rdi), %xmm2
|
||||||
|
; SSE4A-NEXT: movaps 48(%rdi), %xmm3
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt16xfloat:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt16xfloat:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt16xfloat:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt16xfloat:
|
; AVX512-LABEL: test_load_nt16xfloat:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovaps (%rdi), %zmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <16 x float>, <16 x float>* %ptr, align 64, !nontemporal !1
|
%0 = load <16 x float>, <16 x float>* %ptr, align 64, !nontemporal !1
|
||||||
@ -800,23 +944,45 @@ entry:
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
|
define <8 x double> @test_load_nt8xdouble(<8 x double>* nocapture %ptr) {
|
||||||
; SSE-LABEL: test_load_nt8xdouble:
|
; SSE2-LABEL: test_load_nt8xdouble:
|
||||||
; SSE: # BB#0: # %entry
|
; SSE2: # BB#0: # %entry
|
||||||
; SSE-NEXT: movapd (%rdi), %xmm0
|
; SSE2-NEXT: movapd (%rdi), %xmm0
|
||||||
; SSE-NEXT: movapd 16(%rdi), %xmm1
|
; SSE2-NEXT: movapd 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movapd 32(%rdi), %xmm2
|
; SSE2-NEXT: movapd 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movapd 48(%rdi), %xmm3
|
; SSE2-NEXT: movapd 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_load_nt8xdouble:
|
; SSE4A-LABEL: test_load_nt8xdouble:
|
||||||
; AVX: # BB#0: # %entry
|
; SSE4A: # BB#0: # %entry
|
||||||
; AVX-NEXT: vmovapd (%rdi), %ymm0
|
; SSE4A-NEXT: movapd (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovapd 32(%rdi), %ymm1
|
; SSE4A-NEXT: movapd 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE4A-NEXT: movapd 32(%rdi), %xmm2
|
||||||
|
; SSE4A-NEXT: movapd 48(%rdi), %xmm3
|
||||||
|
; SSE4A-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_load_nt8xdouble:
|
||||||
|
; SSE41: # BB#0: # %entry
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_load_nt8xdouble:
|
||||||
|
; AVX1: # BB#0: # %entry
|
||||||
|
; AVX1-NEXT: vmovapd (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovapd 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_load_nt8xdouble:
|
||||||
|
; AVX2: # BB#0: # %entry
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_load_nt8xdouble:
|
; AVX512-LABEL: test_load_nt8xdouble:
|
||||||
; AVX512: # BB#0: # %entry
|
; AVX512: # BB#0: # %entry
|
||||||
; AVX512-NEXT: vmovapd (%rdi), %zmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
entry:
|
entry:
|
||||||
%0 = load <8 x double>, <8 x double>* %ptr, align 64, !nontemporal !1
|
%0 = load <8 x double>, <8 x double>* %ptr, align 64, !nontemporal !1
|
||||||
|
@ -7,46 +7,54 @@
|
|||||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512BW
|
||||||
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
|
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512VL
|
||||||
|
|
||||||
; FIXME: Tests for nontemporal load support which was introduced in SSE41
|
|
||||||
|
|
||||||
define <4 x float> @test_v4f32(<4 x float>* %src) {
|
define <4 x float> @test_v4f32(<4 x float>* %src) {
|
||||||
; SSE-LABEL: test_v4f32:
|
; SSE2-LABEL: test_v4f32:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_v4f32:
|
||||||
|
; SSE41: # BB#0:
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v4f32:
|
; AVX-LABEL: test_v4f32:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_v4f32:
|
; AVX512-LABEL: test_v4f32:
|
||||||
; AVX512: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1
|
%1 = load <4 x float>, <4 x float>* %src, align 16, !nontemporal !1
|
||||||
ret <4 x float> %1
|
ret <4 x float> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i32> @test_v4i32(<4 x i32>* %src) {
|
define <4 x i32> @test_v4i32(<4 x i32>* %src) {
|
||||||
; SSE-LABEL: test_v4i32:
|
; SSE2-LABEL: test_v4i32:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_v4i32:
|
||||||
|
; SSE41: # BB#0:
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v4i32:
|
; AVX-LABEL: test_v4i32:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v4i32:
|
; AVX512F-LABEL: test_v4i32:
|
||||||
; AVX512F: # BB#0:
|
; AVX512F: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512F-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512F-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v4i32:
|
; AVX512BW-LABEL: test_v4i32:
|
||||||
; AVX512BW: # BB#0:
|
; AVX512BW: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512BW-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX512BW-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v4i32:
|
; AVX512VL-LABEL: test_v4i32:
|
||||||
@ -58,117 +66,97 @@ define <4 x i32> @test_v4i32(<4 x i32>* %src) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <2 x double> @test_v2f64(<2 x double>* %src) {
|
define <2 x double> @test_v2f64(<2 x double>* %src) {
|
||||||
; SSE-LABEL: test_v2f64:
|
; SSE2-LABEL: test_v2f64:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_v2f64:
|
||||||
|
; SSE41: # BB#0:
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v2f64:
|
; AVX-LABEL: test_v2f64:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v2f64:
|
; AVX512-LABEL: test_v2f64:
|
||||||
; AVX512F: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
;
|
|
||||||
; AVX512BW-LABEL: test_v2f64:
|
|
||||||
; AVX512BW: # BB#0:
|
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
|
|
||||||
; AVX512BW-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512VL-LABEL: test_v2f64:
|
|
||||||
; AVX512VL: # BB#0:
|
|
||||||
; AVX512VL-NEXT: vmovapd (%rdi), %xmm0
|
|
||||||
; AVX512VL-NEXT: retq
|
|
||||||
%1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1
|
%1 = load <2 x double>, <2 x double>* %src, align 16, !nontemporal !1
|
||||||
ret <2 x double> %1
|
ret <2 x double> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <2 x i64> @test_v2i64(<2 x i64>* %src) {
|
define <2 x i64> @test_v2i64(<2 x i64>* %src) {
|
||||||
; SSE-LABEL: test_v2i64:
|
; SSE2-LABEL: test_v2i64:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_v2i64:
|
||||||
|
; SSE41: # BB#0:
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v2i64:
|
; AVX-LABEL: test_v2i64:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v2i64:
|
; AVX512-LABEL: test_v2i64:
|
||||||
; AVX512F: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
;
|
|
||||||
; AVX512BW-LABEL: test_v2i64:
|
|
||||||
; AVX512BW: # BB#0:
|
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
|
|
||||||
; AVX512BW-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512VL-LABEL: test_v2i64:
|
|
||||||
; AVX512VL: # BB#0:
|
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0
|
|
||||||
; AVX512VL-NEXT: retq
|
|
||||||
%1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1
|
%1 = load <2 x i64>, <2 x i64>* %src, align 16, !nontemporal !1
|
||||||
ret <2 x i64> %1
|
ret <2 x i64> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i16> @test_v8i16(<8 x i16>* %src) {
|
define <8 x i16> @test_v8i16(<8 x i16>* %src) {
|
||||||
; SSE-LABEL: test_v8i16:
|
; SSE2-LABEL: test_v8i16:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_v8i16:
|
||||||
|
; SSE41: # BB#0:
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v8i16:
|
; AVX-LABEL: test_v8i16:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v8i16:
|
; AVX512-LABEL: test_v8i16:
|
||||||
; AVX512F: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
;
|
|
||||||
; AVX512BW-LABEL: test_v8i16:
|
|
||||||
; AVX512BW: # BB#0:
|
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
|
|
||||||
; AVX512BW-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512VL-LABEL: test_v8i16:
|
|
||||||
; AVX512VL: # BB#0:
|
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0
|
|
||||||
; AVX512VL-NEXT: retq
|
|
||||||
%1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1
|
%1 = load <8 x i16>, <8 x i16>* %src, align 16, !nontemporal !1
|
||||||
ret <8 x i16> %1
|
ret <8 x i16> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x i8> @test_v16i8(<16 x i8>* %src) {
|
define <16 x i8> @test_v16i8(<16 x i8>* %src) {
|
||||||
; SSE-LABEL: test_v16i8:
|
; SSE2-LABEL: test_v16i8:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
|
;
|
||||||
|
; SSE41-LABEL: test_v16i8:
|
||||||
|
; SSE41: # BB#0:
|
||||||
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v16i8:
|
; AVX-LABEL: test_v16i8:
|
||||||
; AVX: # BB#0:
|
; AVX: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %xmm0
|
; AVX-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; AVX-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v16i8:
|
; AVX512-LABEL: test_v16i8:
|
||||||
; AVX512F: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %xmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %xmm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
;
|
|
||||||
; AVX512BW-LABEL: test_v16i8:
|
|
||||||
; AVX512BW: # BB#0:
|
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %xmm0
|
|
||||||
; AVX512BW-NEXT: retq
|
|
||||||
;
|
|
||||||
; AVX512VL-LABEL: test_v16i8:
|
|
||||||
; AVX512VL: # BB#0:
|
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %xmm0
|
|
||||||
; AVX512VL-NEXT: retq
|
|
||||||
%1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1
|
%1 = load <16 x i8>, <16 x i8>* %src, align 16, !nontemporal !1
|
||||||
ret <16 x i8> %1
|
ret <16 x i8> %1
|
||||||
}
|
}
|
||||||
@ -176,45 +164,67 @@ define <16 x i8> @test_v16i8(<16 x i8>* %src) {
|
|||||||
; And now YMM versions.
|
; And now YMM versions.
|
||||||
|
|
||||||
define <8 x float> @test_v8f32(<8 x float>* %src) {
|
define <8 x float> @test_v8f32(<8 x float>* %src) {
|
||||||
; SSE-LABEL: test_v8f32:
|
; SSE2-LABEL: test_v8f32:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v8f32:
|
; SSE41-LABEL: test_v8f32:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v8f32:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v8f32:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_v8f32:
|
; AVX512-LABEL: test_v8f32:
|
||||||
; AVX512: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512-NEXT: vmovaps (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1
|
%1 = load <8 x float>, <8 x float>* %src, align 32, !nontemporal !1
|
||||||
ret <8 x float> %1
|
ret <8 x float> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i32> @test_v8i32(<8 x i32>* %src) {
|
define <8 x i32> @test_v8i32(<8 x i32>* %src) {
|
||||||
; SSE-LABEL: test_v8i32:
|
; SSE2-LABEL: test_v8i32:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v8i32:
|
; SSE41-LABEL: test_v8i32:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v8i32:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v8i32:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v8i32:
|
; AVX512F-LABEL: test_v8i32:
|
||||||
; AVX512F: # BB#0:
|
; AVX512F: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
|
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX512F-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v8i32:
|
; AVX512BW-LABEL: test_v8i32:
|
||||||
; AVX512BW: # BB#0:
|
; AVX512BW: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
|
; AVX512BW-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX512BW-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v8i32:
|
; AVX512VL-LABEL: test_v8i32:
|
||||||
@ -226,121 +236,125 @@ define <8 x i32> @test_v8i32(<8 x i32>* %src) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
define <4 x double> @test_v4f64(<4 x double>* %src) {
|
define <4 x double> @test_v4f64(<4 x double>* %src) {
|
||||||
; SSE-LABEL: test_v4f64:
|
; SSE2-LABEL: test_v4f64:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v4f64:
|
; SSE41-LABEL: test_v4f64:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v4f64:
|
; AVX1-LABEL: test_v4f64:
|
||||||
; AVX512F: # BB#0:
|
; AVX1: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX1-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v4f64:
|
; AVX2-LABEL: test_v4f64:
|
||||||
; AVX512BW: # BB#0:
|
; AVX2: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v4f64:
|
; AVX512-LABEL: test_v4f64:
|
||||||
; AVX512VL: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512VL-NEXT: vmovapd (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512VL-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1
|
%1 = load <4 x double>, <4 x double>* %src, align 32, !nontemporal !1
|
||||||
ret <4 x double> %1
|
ret <4 x double> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <4 x i64> @test_v4i64(<4 x i64>* %src) {
|
define <4 x i64> @test_v4i64(<4 x i64>* %src) {
|
||||||
; SSE-LABEL: test_v4i64:
|
; SSE2-LABEL: test_v4i64:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v4i64:
|
; SSE41-LABEL: test_v4i64:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v4i64:
|
; AVX1-LABEL: test_v4i64:
|
||||||
; AVX512F: # BB#0:
|
; AVX1: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX1-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v4i64:
|
; AVX2-LABEL: test_v4i64:
|
||||||
; AVX512BW: # BB#0:
|
; AVX2: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v4i64:
|
; AVX512-LABEL: test_v4i64:
|
||||||
; AVX512VL: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512VL-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1
|
%1 = load <4 x i64>, <4 x i64>* %src, align 32, !nontemporal !1
|
||||||
ret <4 x i64> %1
|
ret <4 x i64> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x i16> @test_v16i16(<16 x i16>* %src) {
|
define <16 x i16> @test_v16i16(<16 x i16>* %src) {
|
||||||
; SSE-LABEL: test_v16i16:
|
; SSE2-LABEL: test_v16i16:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v16i16:
|
; SSE41-LABEL: test_v16i16:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v16i16:
|
; AVX1-LABEL: test_v16i16:
|
||||||
; AVX512F: # BB#0:
|
; AVX1: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX1-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v16i16:
|
; AVX2-LABEL: test_v16i16:
|
||||||
; AVX512BW: # BB#0:
|
; AVX2: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v16i16:
|
; AVX512-LABEL: test_v16i16:
|
||||||
; AVX512VL: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512VL-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1
|
%1 = load <16 x i16>, <16 x i16>* %src, align 32, !nontemporal !1
|
||||||
ret <16 x i16> %1
|
ret <16 x i16> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <32 x i8> @test_v32i8(<32 x i8>* %src) {
|
define <32 x i8> @test_v32i8(<32 x i8>* %src) {
|
||||||
; SSE-LABEL: test_v32i8:
|
; SSE2-LABEL: test_v32i8:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v32i8:
|
; SSE41-LABEL: test_v32i8:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
|
; SSE41-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v32i8:
|
; AVX1-LABEL: test_v32i8:
|
||||||
; AVX512F: # BB#0:
|
; AVX1: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
; AVX512F-NEXT: retq
|
; AVX1-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v32i8:
|
; AVX2-LABEL: test_v32i8:
|
||||||
; AVX512BW: # BB#0:
|
; AVX2: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovaps (%rdi), %ymm0
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v32i8:
|
; AVX512-LABEL: test_v32i8:
|
||||||
; AVX512VL: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512VL-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1
|
%1 = load <32 x i8>, <32 x i8>* %src, align 32, !nontemporal !1
|
||||||
ret <32 x i8> %1
|
ret <32 x i8> %1
|
||||||
}
|
}
|
||||||
@ -348,162 +362,246 @@ define <32 x i8> @test_v32i8(<32 x i8>* %src) {
|
|||||||
; And now ZMM versions.
|
; And now ZMM versions.
|
||||||
|
|
||||||
define <16 x float> @test_v16f32(<16 x float>* %src) {
|
define <16 x float> @test_v16f32(<16 x float>* %src) {
|
||||||
; SSE-LABEL: test_v16f32:
|
; SSE2-LABEL: test_v16f32:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movaps 32(%rdi), %xmm2
|
; SSE2-NEXT: movaps 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movaps 48(%rdi), %xmm3
|
; SSE2-NEXT: movaps 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v16f32:
|
; SSE41-LABEL: test_v16f32:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v16f32:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v16f32:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_v16f32:
|
; AVX512-LABEL: test_v16f32:
|
||||||
; AVX512: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512-NEXT: vmovaps (%rdi), %zmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1
|
%1 = load <16 x float>, <16 x float>* %src, align 64, !nontemporal !1
|
||||||
ret <16 x float> %1
|
ret <16 x float> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <16 x i32> @test_v16i32(<16 x i32>* %src) {
|
define <16 x i32> @test_v16i32(<16 x i32>* %src) {
|
||||||
; SSE-LABEL: test_v16i32:
|
; SSE2-LABEL: test_v16i32:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movaps 32(%rdi), %xmm2
|
; SSE2-NEXT: movaps 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movaps 48(%rdi), %xmm3
|
; SSE2-NEXT: movaps 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v16i32:
|
; SSE41-LABEL: test_v16i32:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v16i32:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v16i32:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_v16i32:
|
; AVX512-LABEL: test_v16i32:
|
||||||
; AVX512: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512-NEXT: vmovdqa32 (%rdi), %zmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1
|
%1 = load <16 x i32>, <16 x i32>* %src, align 64, !nontemporal !1
|
||||||
ret <16 x i32> %1
|
ret <16 x i32> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x double> @test_v8f64(<8 x double>* %src) {
|
define <8 x double> @test_v8f64(<8 x double>* %src) {
|
||||||
; SSE-LABEL: test_v8f64:
|
; SSE2-LABEL: test_v8f64:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movaps 32(%rdi), %xmm2
|
; SSE2-NEXT: movaps 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movaps 48(%rdi), %xmm3
|
; SSE2-NEXT: movaps 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v8f64:
|
; SSE41-LABEL: test_v8f64:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v8f64:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v8f64:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_v8f64:
|
; AVX512-LABEL: test_v8f64:
|
||||||
; AVX512: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512-NEXT: vmovapd (%rdi), %zmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1
|
%1 = load <8 x double>, <8 x double>* %src, align 64, !nontemporal !1
|
||||||
ret <8 x double> %1
|
ret <8 x double> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <8 x i64> @test_v8i64(<8 x i64>* %src) {
|
define <8 x i64> @test_v8i64(<8 x i64>* %src) {
|
||||||
; SSE-LABEL: test_v8i64:
|
; SSE2-LABEL: test_v8i64:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movaps 32(%rdi), %xmm2
|
; SSE2-NEXT: movaps 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movaps 48(%rdi), %xmm3
|
; SSE2-NEXT: movaps 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v8i64:
|
; SSE41-LABEL: test_v8i64:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v8i64:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v8i64:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512-LABEL: test_v8i64:
|
; AVX512-LABEL: test_v8i64:
|
||||||
; AVX512: # BB#0:
|
; AVX512: # BB#0:
|
||||||
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
|
; AVX512-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512-NEXT: retq
|
; AVX512-NEXT: retq
|
||||||
%1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1
|
%1 = load <8 x i64>, <8 x i64>* %src, align 64, !nontemporal !1
|
||||||
ret <8 x i64> %1
|
ret <8 x i64> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <32 x i16> @test_v32i16(<32 x i16>* %src) {
|
define <32 x i16> @test_v32i16(<32 x i16>* %src) {
|
||||||
; SSE-LABEL: test_v32i16:
|
; SSE2-LABEL: test_v32i16:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movaps 32(%rdi), %xmm2
|
; SSE2-NEXT: movaps 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movaps 48(%rdi), %xmm3
|
; SSE2-NEXT: movaps 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v32i16:
|
; SSE41-LABEL: test_v32i16:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v32i16:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v32i16:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v32i16:
|
; AVX512F-LABEL: test_v32i16:
|
||||||
; AVX512F: # BB#0:
|
; AVX512F: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
|
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512F-NEXT: vmovaps 32(%rdi), %ymm1
|
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
; AVX512F-NEXT: retq
|
; AVX512F-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v32i16:
|
; AVX512BW-LABEL: test_v32i16:
|
||||||
; AVX512BW: # BB#0:
|
; AVX512BW: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovdqu16 (%rdi), %zmm0
|
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX512BW-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v32i16:
|
; AVX512VL-LABEL: test_v32i16:
|
||||||
; AVX512VL: # BB#0:
|
; AVX512VL: # BB#0:
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0
|
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512VL-NEXT: vmovdqa64 32(%rdi), %ymm1
|
; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
; AVX512VL-NEXT: retq
|
; AVX512VL-NEXT: retq
|
||||||
%1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
|
%1 = load <32 x i16>, <32 x i16>* %src, align 64, !nontemporal !1
|
||||||
ret <32 x i16> %1
|
ret <32 x i16> %1
|
||||||
}
|
}
|
||||||
|
|
||||||
define <64 x i8> @test_v64i8(<64 x i8>* %src) {
|
define <64 x i8> @test_v64i8(<64 x i8>* %src) {
|
||||||
; SSE-LABEL: test_v64i8:
|
; SSE2-LABEL: test_v64i8:
|
||||||
; SSE: # BB#0:
|
; SSE2: # BB#0:
|
||||||
; SSE-NEXT: movaps (%rdi), %xmm0
|
; SSE2-NEXT: movaps (%rdi), %xmm0
|
||||||
; SSE-NEXT: movaps 16(%rdi), %xmm1
|
; SSE2-NEXT: movaps 16(%rdi), %xmm1
|
||||||
; SSE-NEXT: movaps 32(%rdi), %xmm2
|
; SSE2-NEXT: movaps 32(%rdi), %xmm2
|
||||||
; SSE-NEXT: movaps 48(%rdi), %xmm3
|
; SSE2-NEXT: movaps 48(%rdi), %xmm3
|
||||||
; SSE-NEXT: retq
|
; SSE2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX-LABEL: test_v64i8:
|
; SSE41-LABEL: test_v64i8:
|
||||||
; AVX: # BB#0:
|
; SSE41: # BB#0:
|
||||||
; AVX-NEXT: vmovaps (%rdi), %ymm0
|
; SSE41-NEXT: movntdqa (%rdi), %xmm0
|
||||||
; AVX-NEXT: vmovaps 32(%rdi), %ymm1
|
; SSE41-NEXT: movntdqa 16(%rdi), %xmm1
|
||||||
; AVX-NEXT: retq
|
; SSE41-NEXT: movntdqa 32(%rdi), %xmm2
|
||||||
|
; SSE41-NEXT: movntdqa 48(%rdi), %xmm3
|
||||||
|
; SSE41-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX1-LABEL: test_v64i8:
|
||||||
|
; AVX1: # BB#0:
|
||||||
|
; AVX1-NEXT: vmovaps (%rdi), %ymm0
|
||||||
|
; AVX1-NEXT: vmovaps 32(%rdi), %ymm1
|
||||||
|
; AVX1-NEXT: retq
|
||||||
|
;
|
||||||
|
; AVX2-LABEL: test_v64i8:
|
||||||
|
; AVX2: # BB#0:
|
||||||
|
; AVX2-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
|
; AVX2-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
|
; AVX2-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512F-LABEL: test_v64i8:
|
; AVX512F-LABEL: test_v64i8:
|
||||||
; AVX512F: # BB#0:
|
; AVX512F: # BB#0:
|
||||||
; AVX512F-NEXT: vmovaps (%rdi), %ymm0
|
; AVX512F-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512F-NEXT: vmovaps 32(%rdi), %ymm1
|
; AVX512F-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
; AVX512F-NEXT: retq
|
; AVX512F-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512BW-LABEL: test_v64i8:
|
; AVX512BW-LABEL: test_v64i8:
|
||||||
; AVX512BW: # BB#0:
|
; AVX512BW: # BB#0:
|
||||||
; AVX512BW-NEXT: vmovdqu8 (%rdi), %zmm0
|
; AVX512BW-NEXT: vmovntdqa (%rdi), %zmm0
|
||||||
; AVX512BW-NEXT: retq
|
; AVX512BW-NEXT: retq
|
||||||
;
|
;
|
||||||
; AVX512VL-LABEL: test_v64i8:
|
; AVX512VL-LABEL: test_v64i8:
|
||||||
; AVX512VL: # BB#0:
|
; AVX512VL: # BB#0:
|
||||||
; AVX512VL-NEXT: vmovdqa64 (%rdi), %ymm0
|
; AVX512VL-NEXT: vmovntdqa (%rdi), %ymm0
|
||||||
; AVX512VL-NEXT: vmovdqa64 32(%rdi), %ymm1
|
; AVX512VL-NEXT: vmovntdqa 32(%rdi), %ymm1
|
||||||
; AVX512VL-NEXT: retq
|
; AVX512VL-NEXT: retq
|
||||||
%1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
|
%1 = load <64 x i8>, <64 x i8>* %src, align 64, !nontemporal !1
|
||||||
ret <64 x i8> %1
|
ret <64 x i8> %1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user