From 99196f8bed1e6093eedd75fc73d46cc4446397cf Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 19 Aug 2017 23:21:22 +0000
Subject: [PATCH] [X86] Merge all of the vecload and alignedload predicates
 into single predicates.

We can load the memory VT and check for natural alignment. This also adds
a new preferNonTemporalLoad helper that checks the correct subtarget
feature based on the load size.

This shrinks the isel table by at least 5000 bytes by allowing more
reordering and combining to occur.

llvm-svn: 311266
---
 lib/Target/X86/X86ISelDAGToDAG.cpp      | 21 +++++++
 lib/Target/X86/X86InstrFragmentsSIMD.td | 77 ++++++++++----------------
 2 files changed, 50 insertions(+), 48 deletions(-)

diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8f24f98be68..789d91175d3 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -399,6 +399,27 @@ namespace {
         return isInt<Width>(CN->getSExtValue());
       return isSExtAbsoluteSymbolRef(Width, N);
     }
+
+    // Indicates we should prefer to use a non-temporal load for this load.
+    bool useNonTemporalLoad(LoadSDNode *N) const {
+      if (!N->isNonTemporal())
+        return false;
+
+      unsigned StoreSize = N->getMemoryVT().getStoreSize();
+
+      if (N->getAlignment() < StoreSize)
+        return false;
+
+      switch (StoreSize) {
+      default: llvm_unreachable("Unsupported store size");
+      case 16:
+        return Subtarget->hasSSE41();
+      case 32:
+        return Subtarget->hasAVX2();
+      case 64:
+        return Subtarget->hasAVX512();
+      }
+    }
   };
 }
 
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 7f7a1a1ba30..e49fb62311d 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -647,35 +647,26 @@ def sdmem : Operand<v2f64> {
 
 // Vector load wrappers to prevent folding of non-temporal aligned loads on
 // supporting targets.
-def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() ||
-         cast<LoadSDNode>(N)->getAlignment() < 16;
-}]>;
-def vec256load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return !Subtarget->hasAVX2() || !cast<LoadSDNode>(N)->isNonTemporal() ||
-         cast<LoadSDNode>(N)->getAlignment() < 32;
-}]>;
-def vec512load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return !Subtarget->hasAVX512() || !cast<LoadSDNode>(N)->isNonTemporal() ||
-         cast<LoadSDNode>(N)->getAlignment() < 64;
+def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return !useNonTemporalLoad(cast<LoadSDNode>(N));
 }]>;
 
 // 128-bit load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
-def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vec128load node:$ptr))>;
-def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vec128load node:$ptr))>;
-def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vec128load node:$ptr))>;
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>;
 
 // 256-bit load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vec256load node:$ptr))>;
-def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vec256load node:$ptr))>;
-def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vec256load node:$ptr))>;
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>;
 
 // 512-bit load pattern fragments
-def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vec512load node:$ptr))>;
-def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vec512load node:$ptr))>;
-def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vec512load node:$ptr))>;
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>;
 
 // 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
@@ -685,63 +676,53 @@ def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
 // Like 'store', but always requires vector size alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
                            (store node:$val, node:$ptr), [{
-  StoreSDNode *St = cast<StoreSDNode>(N);
+  auto *St = cast<StoreSDNode>(N);
   return St->getAlignment() >= St->getMemoryVT().getStoreSize();
 }]>;
 
 // Like 'load', but always requires 128-bit vector alignment.
-def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAlignment() >= 16 &&
-         (!Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal());
-}]>;
-
-// Like 'load', but always requires 256-bit vector alignment.
-def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAlignment() >= 32 &&
-         (!Subtarget->hasAVX2() || !cast<LoadSDNode>(N)->isNonTemporal());
-}]>;
-
-// Like 'load', but always requires 512-bit vector alignment.
-def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAlignment() >= 64 &&
-         (!Subtarget->hasAVX512() || !cast<LoadSDNode>(N)->isNonTemporal());
+def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  auto *Ld = cast<LoadSDNode>(N);
+  return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() &&
+         !useNonTemporalLoad(cast<LoadSDNode>(N));
 }]>;
 
 // 128-bit aligned load pattern fragments
 // NOTE: all 128-bit integer vector loads are promoted to v2i64
 def alignedloadv4f32 : PatFrag<(ops node:$ptr),
-                               (v4f32 (alignedload node:$ptr))>;
+                               (v4f32 (alignedvecload node:$ptr))>;
 def alignedloadv2f64 : PatFrag<(ops node:$ptr),
-                               (v2f64 (alignedload node:$ptr))>;
+                               (v2f64 (alignedvecload node:$ptr))>;
 def alignedloadv2i64 : PatFrag<(ops node:$ptr),
-                               (v2i64 (alignedload node:$ptr))>;
+                               (v2i64 (alignedvecload node:$ptr))>;
 
 // 256-bit aligned load pattern fragments
 // NOTE: all 256-bit integer vector loads are promoted to v4i64
 def alignedloadv8f32 : PatFrag<(ops node:$ptr),
-                               (v8f32 (alignedload256 node:$ptr))>;
+                               (v8f32 (alignedvecload node:$ptr))>;
 def alignedloadv4f64 : PatFrag<(ops node:$ptr),
-                               (v4f64 (alignedload256 node:$ptr))>;
+                               (v4f64 (alignedvecload node:$ptr))>;
 def alignedloadv4i64 : PatFrag<(ops node:$ptr),
-                               (v4i64 (alignedload256 node:$ptr))>;
+                               (v4i64 (alignedvecload node:$ptr))>;
 
 // 512-bit aligned load pattern fragments
 def alignedloadv16f32 : PatFrag<(ops node:$ptr),
-                                (v16f32 (alignedload512 node:$ptr))>;
+                                (v16f32 (alignedvecload node:$ptr))>;
 def alignedloadv8f64 : PatFrag<(ops node:$ptr),
-                               (v8f64 (alignedload512 node:$ptr))>;
+                               (v8f64 (alignedvecload node:$ptr))>;
 def alignedloadv8i64 : PatFrag<(ops node:$ptr),
-                               (v8i64 (alignedload512 node:$ptr))>;
+                               (v8i64 (alignedvecload node:$ptr))>;
 
-// Like 'vec128load', but uses special alignment checks suitable for use in
+// Like 'vecload', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
 // be naturally aligned on some targets but not on others. If the subtarget
 // allows unaligned accesses, match any load, though this may require
 // setting a feature bit in the processor (on startup, for example).
 // Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (vec128load node:$ptr), [{
+def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{
+  auto *Ld = cast<LoadSDNode>(N);
   return Subtarget->hasSSEUnalignedMem() ||
-         cast<LoadSDNode>(N)->getAlignment() >= 16;
+         Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
 }]>;
 
 // 128-bit memop pattern fragments
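
Note: the following is a minimal standalone C++ sketch of the consolidated predicate
logic, not LLVM source. The Features and Load structs and the matchVecload /
matchAlignedVecload helpers are hypothetical stand-ins for X86Subtarget, LoadSDNode,
and the TableGen fragments; they only illustrate the natural-alignment check and the
size-to-feature mapping that the patch centralizes in one helper.

#include <cassert>

// Hypothetical stand-ins reduced to the fields the predicates actually consult.
struct Features {
  bool SSE41;
  bool AVX2;
  bool AVX512;
};

struct Load {
  unsigned StoreSize;  // getMemoryVT().getStoreSize() in bytes: 16, 32 or 64
  unsigned Alignment;  // getAlignment() in bytes
  bool NonTemporal;    // isNonTemporal()
};

// Mirrors the new helper: honor the non-temporal hint only when the load is
// naturally aligned and the subtarget has a non-temporal load of that width.
static bool useNonTemporalLoad(const Load &L, const Features &F) {
  if (!L.NonTemporal || L.Alignment < L.StoreSize)
    return false;
  switch (L.StoreSize) {
  case 16: return F.SSE41;
  case 32: return F.AVX2;
  case 64: return F.AVX512;
  default: assert(false && "Unsupported store size"); return false;
  }
}

// Mirrors the merged TableGen predicates: 'vecload' matches any load not claimed
// by the non-temporal patterns; 'alignedvecload' additionally requires natural
// alignment, regardless of vector width.
static bool matchVecload(const Load &L, const Features &F) {
  return !useNonTemporalLoad(L, F);
}
static bool matchAlignedVecload(const Load &L, const Features &F) {
  return L.Alignment >= L.StoreSize && !useNonTemporalLoad(L, F);
}

int main() {
  Features AVX2Target = {true, true, false};
  // An aligned non-temporal 256-bit load on an AVX2 target is left to the
  // non-temporal patterns, so the plain vector-load fragment rejects it.
  Load NT256 = {32, 32, true};
  assert(!matchVecload(NT256, AVX2Target));
  // Without the hint the same load is matched, and it also satisfies the
  // natural-alignment requirement of the aligned fragment.
  Load Plain256 = {32, 32, false};
  assert(matchVecload(Plain256, AVX2Target));
  assert(matchAlignedVecload(Plain256, AVX2Target));
  return 0;
}

Because every vector width now funnels through the same two fragments instead of
three width-specific ones, the generated isel tables can combine and reorder more
patterns, which is where the commit's table-size reduction comes from.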