[X86][SSE] Select domain for 32/64-bit partial loads for EltsFromConsecutiveLoads
Choose between MOVD/MOVSS and MOVQ/MOVSD depending on the target vector type. This results in far fewer test changes than trying to add the same handling to X86InstrInfo::setExecutionDomain.

llvm-svn: 259816
parent 09f022feb9
commit cc76b8656c
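
For context before the diff: a minimal illustration of the domain selection, distilled from this commit's FileCheck updates. The IR below is a hypothetical reduction of the merge_4f32_f32_45zz test (the function and value names are mine); the before/after asm lines are copied from the updated SSE expectations.

define <4 x float> @two_floats_zero_upper(float* %p) {
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %f0 = load float, float* %p
  %f1 = load float, float* %p1
  %v0 = insertelement <4 x float> zeroinitializer, float %f0, i32 0
  %v1 = insertelement <4 x float> %v0, float %f1, i32 1
  ret <4 x float> %v1
}
; Before: movq {{.*#+}} xmm0 = mem[0],zero    <- integer domain
; After:  movsd {{.*#+}} xmm0 = mem[0],zero   <- FP domain, matching the v4f32 result
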
@@ -5642,44 +5642,46 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
   // VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 64 &&
-      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v2i64)) ||
-       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v4i64)) ||
-       (VT.is512BitVector() && TLI.isTypeLegal(MVT::v8i64)))) {
-    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
-    SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
-    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
-    SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
-                                LDBase->getPointerInfo(),
-                                LDBase->getAlignment(),
-                                false/*isVolatile*/, true/*ReadMem*/,
-                                false/*WriteMem*/);
-
-    // Make sure the newly-created LOAD is in the same position as LDBase in
-    // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
-    // update uses of LDBase's output chain to use the TokenFactor.
-    if (LDBase->hasAnyUseOfValue(1)) {
-      SDValue NewChain =
-          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
-                      SDValue(ResNode.getNode(), 1));
-      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
-      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
-                             SDValue(ResNode.getNode(), 1));
-    }
-
-    return DAG.getBitcast(VT, ResNode);
+      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+    MVT VecSVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
+    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 64);
+    if (TLI.isTypeLegal(VecVT)) {
+      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
+      SDValue ResNode =
+          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
+                                  LDBase->getPointerInfo(),
+                                  LDBase->getAlignment(),
+                                  false/*isVolatile*/, true/*ReadMem*/,
+                                  false/*WriteMem*/);
+
+      // Make sure the newly-created LOAD is in the same position as LDBase in
+      // terms of dependency. We create a TokenFactor for LDBase and ResNode,
+      // and update uses of LDBase's output chain to use the TokenFactor.
+      if (LDBase->hasAnyUseOfValue(1)) {
+        SDValue NewChain =
+            DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
+                        SDValue(ResNode.getNode(), 1));
+        DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+        DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                               SDValue(ResNode.getNode(), 1));
+      }
+
+      return DAG.getBitcast(VT, ResNode);
+    }
   }
 
   // VZEXT_MOVL - consecutive 32-bit load/undefs followed by zeros/undefs.
   if (IsConsecutiveLoad && FirstLoadedElt == 0 && LoadSize == 32 &&
-      ((VT.is128BitVector() && TLI.isTypeLegal(MVT::v4i32)) ||
-       (VT.is256BitVector() && TLI.isTypeLegal(MVT::v8i32)) ||
-       (VT.is512BitVector() && TLI.isTypeLegal(MVT::v16i32)))) {
-    MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
-    SDValue V = CreateLoad(MVT::i32, LDBase);
-    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
-    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
-    return DAG.getBitcast(VT, V);
+      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
+    MVT VecSVT = VT.isFloatingPoint() ? MVT::f32 : MVT::i32;
+    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / 32);
+    if (TLI.isTypeLegal(VecVT)) {
+      SDValue V = CreateLoad(VecSVT, LDBase);
+      V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, V);
+      V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, V);
+      return DAG.getBitcast(VT, V);
+    }
   }
 
   return SDValue();

@@ -3046,6 +3046,8 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
 
   // Represent the same patterns above but in the form they appear for
   // 512-bit types

@@ -3058,6 +3060,8 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v8f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
   }
   def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                    (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),

@@ -649,6 +649,8 @@ let Predicates = [UseAVX] in {
   def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                    (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
             (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzload addr:$src)),
+            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
   }
 
   // Extract and store.

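The VMOVSDZrm/VMOVSDrm additions above give the f64-typed X86vzload nodes produced by the new 256-/512-bit paths a pattern to select; without one, those nodes would have no FP-domain lowering. A sketch of IR that should now reach the UseAVX pattern (a hypothetical reduction of merge_8f32_f32_12zzuuzz, whose expectations change below from vmovq to vmovsd):

define <8 x float> @two_floats_zero_rest(float* %p) {
  %p1 = getelementptr inbounds float, float* %p, i64 1
  %f0 = load float, float* %p
  %f1 = load float, float* %p1
  %v0 = insertelement <8 x float> zeroinitializer, float %f0, i32 0
  %v1 = insertelement <8 x float> %v0, float %f1, i32 1
  ret <8 x float> %v1
}
; AVX, before: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX, after:  vmovsd {{.*#+}} xmm0 = mem[0],zero
; (the VEX-encoded vmovsd implicitly zeroes the upper ymm lanes, so no
; explicit vxorps/vinsertf128 is needed)
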
@@ -133,12 +133,12 @@ define <4 x float> @insertps_undef_input1(<4 x float> %a0, <4 x float> %a1) {
 define <4 x float> @consecutive_load_insertps_04zz(float* %p) {
 ; SSE-LABEL: consecutive_load_insertps_04zz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: consecutive_load_insertps_04zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
   %p0 = getelementptr inbounds float, float* %p, i64 1
   %p1 = getelementptr inbounds float, float* %p, i64 2

@@ -115,18 +115,18 @@ define <4 x float> @merge_4f32_f32_3zuu(float* %ptr) nounwind uwtable noinline ssp {
 define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4f32_f32_34uu:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_34uu:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_34uu:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 4

@@ -140,23 +140,23 @@ define <4 x float> @merge_4f32_f32_34uu(float* %ptr) nounwind uwtable noinline ssp {
 define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4f32_f32_34z6:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_34z6:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,0]
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[1,0]
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_34z6:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,0]
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 3

@@ -174,18 +174,18 @@ define <4 x float> @merge_4f32_f32_34z6(float* %ptr) nounwind uwtable noinline ssp {
 define <4 x float> @merge_4f32_f32_45zz(float* %ptr) nounwind uwtable noinline ssp {
 ; SSE-LABEL: merge_4f32_f32_45zz:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_45zz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_45zz:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 4
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 5

@@ -207,20 +207,20 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline ssp {
 ;
 ; SSE41-LABEL: merge_4f32_f32_012u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_012u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_012u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0

@@ -248,20 +248,20 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline ssp {
 ;
 ; SSE41-LABEL: merge_4f32_f32_019u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE41-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_019u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
 ;
 ; X32-SSE-LABEL: merge_4f32_f32_019u:
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; X32-SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-SSE-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0

@@ -298,11 +298,11 @@ define <8 x float> @merge_8f32_2f32_23z5(<2 x float>* %ptr) nounwind uwtable noinline ssp {
 ; X32-AVX-LABEL: merge_8f32_2f32_23z5:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; X32-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; X32-AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; X32-AVX-NEXT:    vmovupd 16(%eax), %xmm1
-; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X32-AVX-NEXT:    vmovupd 16(%eax), %xmm0
+; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X32-AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; X32-AVX-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 2
   %ptr1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3

@@ -338,13 +338,13 @@ define <8 x float> @merge_8f32_4f32_z2(<4 x float>* %ptr) nounwind uwtable noinline ssp {
 define <8 x float> @merge_8f32_f32_12zzuuzz(float* %ptr) nounwind uwtable noinline ssp {
 ; AVX-LABEL: merge_8f32_f32_12zzuuzz:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    retq
 ;
 ; X32-AVX-LABEL: merge_8f32_f32_12zzuuzz:
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 1
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 2

@@ -271,13 +271,13 @@ define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
 define <16 x float> @merge_16f32_f32_89zzzuuuuuuuuuuuz(float* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16f32_f32_89zzzuuuuuuuuuuuz:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 8
   %ptr1 = getelementptr inbounds float, float* %ptr, i64 9

@@ -347,7 +347,7 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwtable noinline ssp {
 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; ALL-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0

@@ -360,7 +360,7 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X32-AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X32-AVX512F-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0

@@ -1,22 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,-slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=FAST32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+slow-unaligned-mem-32 | FileCheck %s --check-prefix=ALL --check-prefix=SLOW32
 
 define <4 x float> @merge_2_floats(float* nocapture %p) nounwind readonly {
+; ALL-LABEL: merge_2_floats:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT:    retq
   %tmp1 = load float, float* %p
   %vecins = insertelement <4 x float> undef, float %tmp1, i32 0
   %add.ptr = getelementptr float, float* %p, i32 1
   %tmp5 = load float, float* %add.ptr
   %vecins7 = insertelement <4 x float> %vecins, float %tmp5, i32 1
   ret <4 x float> %vecins7
-
-; ALL-LABEL: merge_2_floats
-; ALL: vmovq
-; ALL-NEXT: retq
 }
 
 ; Test-case generated due to a crash when trying to treat loading the first
 ; two i64s of a <4 x i64> as a load of two i32s.
 define <4 x i64> @merge_2_floats_into_4() {
+; ALL-LABEL: merge_2_floats_into_4:
+; ALL:       # BB#0:
+; ALL-NEXT:    movq (%rax), %rax
+; ALL-NEXT:    vmovups (%rax), %xmm0
+; ALL-NEXT:    retq
   %1 = load i64*, i64** undef, align 8
   %2 = getelementptr inbounds i64, i64* %1, i64 0
   %3 = load i64, i64* %2
@@ -27,13 +33,13 @@ define <4 x i64> @merge_2_floats_into_4() {
   %8 = insertelement <4 x i64> %4, i64 %7, i32 1
   %9 = shufflevector <4 x i64> %8, <4 x i64> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   ret <4 x i64> %9
-
-; ALL-LABEL: merge_2_floats_into_4
-; ALL: vmovups
-; ALL-NEXT: retq
 }
 
 define <4 x float> @merge_4_floats(float* %ptr) {
+; ALL-LABEL: merge_4_floats:
+; ALL:       # BB#0:
+; ALL-NEXT:    vmovups (%rdi), %xmm0
+; ALL-NEXT:    retq
   %a = load float, float* %ptr, align 8
   %vec = insertelement <4 x float> undef, float %a, i32 0
   %idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -46,18 +52,24 @@ define <4 x float> @merge_4_floats(float* %ptr) {
   %d = load float, float* %idx5, align 8
   %vec6 = insertelement <4 x float> %vec4, float %d, i32 3
   ret <4 x float> %vec6
-
-; ALL-LABEL: merge_4_floats
-; ALL: vmovups
-; ALL-NEXT: retq
 }
 
 ; PR21710 ( http://llvm.org/bugs/show_bug.cgi?id=21710 )
 ; Make sure that 32-byte vectors are handled efficiently.
 ; If the target has slow 32-byte accesses, we should still generate
 ; 16-byte loads.
 
 define <8 x float> @merge_8_floats(float* %ptr) {
+; FAST32-LABEL: merge_8_floats:
+; FAST32:       # BB#0:
+; FAST32-NEXT:    vmovups (%rdi), %ymm0
+; FAST32-NEXT:    retq
+;
+; SLOW32-LABEL: merge_8_floats:
+; SLOW32:       # BB#0:
+; SLOW32-NEXT:    vmovups (%rdi), %xmm0
+; SLOW32-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT:    retq
   %a = load float, float* %ptr, align 4
   %vec = insertelement <8 x float> undef, float %a, i32 0
   %idx1 = getelementptr inbounds float, float* %ptr, i64 1
@@ -82,18 +94,19 @@ define <8 x float> @merge_8_floats(float* %ptr) {
   %h = load float, float* %idx13, align 4
   %vec14 = insertelement <8 x float> %vec12, float %h, i32 7
   ret <8 x float> %vec14
-
-; ALL-LABEL: merge_8_floats
-
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
 }
 
 define <4 x double> @merge_4_doubles(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles:
+; FAST32:       # BB#0:
+; FAST32-NEXT:    vmovups (%rdi), %ymm0
+; FAST32-NEXT:    retq
+;
+; SLOW32-LABEL: merge_4_doubles:
+; SLOW32:       # BB#0:
+; SLOW32-NEXT:    vmovups (%rdi), %xmm0
+; SLOW32-NEXT:    vinsertf128 $1, 16(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT:    retq
   %a = load double, double* %ptr, align 8
   %vec = insertelement <4 x double> undef, double %a, i32 0
   %idx1 = getelementptr inbounds double, double* %ptr, i64 1
@@ -106,20 +119,22 @@ define <4 x double> @merge_4_doubles(double* %ptr) {
   %d = load double, double* %idx5, align 8
   %vec6 = insertelement <4 x double> %vec4, double %d, i32 3
   ret <4 x double> %vec6
-
-; ALL-LABEL: merge_4_doubles
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
 }
 
 ; PR21771 ( http://llvm.org/bugs/show_bug.cgi?id=21771 )
 ; Recognize and combine consecutive loads even when the
 ; first of the combined loads is offset from the base address.
 define <4 x double> @merge_4_doubles_offset(double* %ptr) {
+; FAST32-LABEL: merge_4_doubles_offset:
+; FAST32:       # BB#0:
+; FAST32-NEXT:    vmovups 32(%rdi), %ymm0
+; FAST32-NEXT:    retq
+;
+; SLOW32-LABEL: merge_4_doubles_offset:
+; SLOW32:       # BB#0:
+; SLOW32-NEXT:    vmovups 32(%rdi), %xmm0
+; SLOW32-NEXT:    vinsertf128 $1, 48(%rdi), %ymm0, %ymm0
+; SLOW32-NEXT:    retq
   %arrayidx4 = getelementptr inbounds double, double* %ptr, i64 4
   %arrayidx5 = getelementptr inbounds double, double* %ptr, i64 5
   %arrayidx6 = getelementptr inbounds double, double* %ptr, i64 6
@@ -133,13 +148,5 @@ define <4 x double> @merge_4_doubles_offset(double* %ptr) {
   %vecinit6 = insertelement <4 x double> %vecinit5, double %g, i32 2
   %vecinit7 = insertelement <4 x double> %vecinit6, double %h, i32 3
   ret <4 x double> %vecinit7
-
-; ALL-LABEL: merge_4_doubles_offset
-; FAST32: vmovups
-; FAST32-NEXT: retq
-
-; SLOW32: vmovups
-; SLOW32-NEXT: vinsertf128
-; SLOW32-NEXT: retq
 }

@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -o %t
 ; RUN: grep movss %t | count 1
-; RUN: grep movq %t | count 1
+; RUN: grep movsd %t | count 1
 ; RUN: grep shufps %t | count 1
 
 define <4 x float> @test(float %a, float %b, float %c) nounwind {