Make sure that the DAG combiner doesn't merge stores that we explicitly asked not to be greater than the preferred vector width for the vectorizer. Test both 128 and 256 with a Skylake architecture.

llvm-svn: 360183
commit 9f47186666 (parent 808bbbad60)
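For context, Subtarget.getPreferVectorWidth() reports the per-function "prefer-vector-width" string attribute (set to 128 and 256 in the new test below, and emitted by clang for -mprefer-vector-width=N). Below is a minimal IR sketch of the memcpy/memmove side that the changed getOptimalMemOpType() hook decides, assuming an AVX-capable CPU such as skylake-avx512; the function names @narrow and @wide are hypothetical and not part of the commit:

; Sketch only: both functions copy 32 bytes, but @narrow asks for a 128-bit
; preferred width, so it should be lowered with 16-byte (xmm) operations,
; while @wide is allowed a single 32-byte (ymm) copy.
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg)

define void @narrow(i8* %dst, i8* %src) #0 {
entry:
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
  ret void
}

define void @wide(i8* %dst, i8* %src) #1 {
entry:
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
  ret void
}

attributes #0 = { nounwind "prefer-vector-width"="128" "target-cpu"="skylake-avx512" }
attributes #1 = { nounwind "prefer-vector-width"="256" "target-cpu"="skylake-avx512" }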
lib/Target/X86/X86ISelLowering.cpp

@@ -2058,18 +2058,19 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
 /// source is constant so it does not need to be loaded.
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
-EVT
-X86TargetLowering::getOptimalMemOpType(
+/// For vector ops we check that the overall size isn't larger than our
+/// preferred vector width.
+EVT X86TargetLowering::getOptimalMemOpType(
     uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
     bool ZeroMemset, bool MemcpyStrSrc,
     const AttributeList &FuncAttributes) const {
   if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
-    if (Size >= 16 &&
-        (!Subtarget.isUnalignedMem16Slow() ||
-         ((DstAlign == 0 || DstAlign >= 16) &&
-          (SrcAlign == 0 || SrcAlign >= 16)))) {
+    if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
+                       ((DstAlign == 0 || DstAlign >= 16) &&
+                        (SrcAlign == 0 || SrcAlign >= 16)))) {
       // FIXME: Check if unaligned 32-byte accesses are slow.
-      if (Size >= 32 && Subtarget.hasAVX()) {
+      if (Size >= 32 && Subtarget.hasAVX() &&
+          (Subtarget.getPreferVectorWidth() >= 256)) {
         // Although this isn't a well-supported type for AVX1, we'll let
         // legalization and shuffle lowering produce the optimal codegen. If we
         // choose an optimal type with a vector element larger than a byte,
@@ -2077,11 +2078,12 @@ X86TargetLowering::getOptimalMemOpType(
         // multiply) before we splat as a vector.
         return MVT::v32i8;
       }
-      if (Subtarget.hasSSE2())
+      if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v16i8;
       // TODO: Can SSE1 handle a byte vector?
       // If we have SSE1 registers we should be able to use them.
-      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()))
+      if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
+          (Subtarget.getPreferVectorWidth() >= 128))
         return MVT::v4f32;
     } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
                !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
@@ -4963,6 +4965,10 @@ bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
     unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
     return (MemVT.getSizeInBits() <= MaxIntSize);
   }
+  // Make sure we don't merge greater than our preferred vector
+  // width.
+  if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth())
+    return false;
   return true;
 }
test/CodeGen/X86/vector-width-store-merge.ll (new file, 53 lines)
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+
+; This tests whether or not we generate vectors larger than the preferred vector width when
+; lowering memmove.
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @A(i8* %src, i8* %dst) local_unnamed_addr #0 {
+entry:
+; CHECK: A
+; CHECK-NOT: vmovups %ymm
+; CHECK: vmovups %xmm
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @B(i8* %src, i8* %dst) local_unnamed_addr #0 {
+entry:
+; CHECK: B
+; CHECK-NOT: vmovups %zmm
+; CHECK: vmovups %xmm
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @C(i8* %src, i8* %dst) local_unnamed_addr #2 {
+entry:
+; CHECK: C
+; CHECK-NOT: vmovups %ymm
+; CHECK: vmovups %ymm
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 32, i1 false)
+  ret void
+}
+
+; Function Attrs: nounwind uwtable
+define weak_odr dso_local void @D(i8* %src, i8* %dst) local_unnamed_addr #2 {
+entry:
+; CHECK: D
+; CHECK-NOT: vmovups %zmm
+; CHECK: vmovups %ymm
+  call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 %dst, i8* align 1 %src, i64 64, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1
+
+attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="128" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "prefer-vector-width"="256" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{i32 1, !"wchar_size", i32 4}
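In the test above, the memmove expansion first produces the narrow (xmm) operations via getOptimalMemOpType(), and the canMergeStoresTo() guard keeps the DAG combiner from merging them back into wider stores. The same store-merging situation can also arise directly from adjacent scalar stores; here is a hedged sketch of IR that would exercise that path (not part of this commit; the function name @stores is hypothetical, and it reuses attributes #0 from the test above):

; Sketch only: four adjacent i64 zero stores that the DAG combiner can merge
; into one wide store. With "prefer-vector-width"="128" (attributes #0) the
; new canMergeStoresTo() check should cap the merge at 128 bits (xmm) rather
; than forming a 256-bit (ymm) store.
define void @stores(i64* %p) #0 {
entry:
  %p1 = getelementptr inbounds i64, i64* %p, i64 1
  %p2 = getelementptr inbounds i64, i64* %p, i64 2
  %p3 = getelementptr inbounds i64, i64* %p, i64 3
  store i64 0, i64* %p, align 8
  store i64 0, i64* %p1, align 8
  store i64 0, i64* %p2, align 8
  store i64 0, i64* %p3, align 8
  ret void
}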