1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 20:23:11 +01:00

[AArch64] Gang up loads and stores for pairing.

Keep loads and stores together (target defines how many loads
and stores to gang up), such that it will help in pairing
and vectorization.

Differential Revision: https://reviews.llvm.org/D46477

llvm-svn: 332482
This commit is contained in:
Sirish Pande 2018-05-16 15:36:52 +00:00
parent 57f6c9317c
commit f2ba0203b0
10 changed files with 122 additions and 27 deletions

View File

@ -1199,6 +1199,15 @@ public:
return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
}
/// \brief Get maximum # of store operations to be glued together
///
/// This function returns the maximum number of store operations permitted
/// to glue together during lowering of llvm.memcpy. The value is set by
/// the target at the performance threshold for such a replacement.
virtual unsigned getMaxGluedStoresPerMemcpy() const {
return MaxGluedStoresPerMemcpy;
}
/// Get maximum # of load operations permitted for memcmp
///
/// This function returns the maximum number of load operations permitted
@ -2509,6 +2518,14 @@ protected:
/// constant size.
unsigned MaxStoresPerMemcpy;
/// \brief Specify max number of store instructions to glue in inlined memcpy.
///
/// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number
/// of store instructions to keep together. This helps in pairing and
// vectorization later on.
unsigned MaxGluedStoresPerMemcpy = 0;
/// Maximum number of store operations that may be substituted for a call to
/// memcpy, used for functions with OptSize attribute.
unsigned MaxStoresPerMemcpyOptSize;

View File

@ -89,6 +89,14 @@ void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
#define DEBUG_TYPE "selectiondag"
static cl::opt<bool> EnableMemCpyDAGOpt("enable-memcpy-dag-opt",
cl::Hidden, cl::init(true),
cl::desc("Gang up loads and stores generated by inlining of memcpy"));
static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
cl::desc("Number limit for gluing ld/st of memcpy."),
cl::Hidden, cl::init(0));
static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
}
@ -5218,6 +5226,31 @@ static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
return MF.getFunction().optForSize();
}
/// Gang up the loads and stores in the index range [From, To) of
/// OutLoadChains/OutStoreChains so later passes can pair or vectorize them.
///
/// All load chains in the range are tied together with a single TokenFactor,
/// and each store is re-emitted with that token as its chain, so every load
/// in the group is ordered before every store. Both the original load chains
/// and the rewritten stores are appended to OutChains.
static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
                                         SmallVector<SDValue, 32> &OutChains,
                                         unsigned From, unsigned To,
                                         SmallVector<SDValue, 16> &OutLoadChains,
                                         SmallVector<SDValue, 16> &OutStoreChains) {
  assert(OutLoadChains.size() && "Missing loads in memcpy inlining");
  assert(OutStoreChains.size() && "Missing stores in memcpy inlining");

  // Collect the load chains for this group; they are also forwarded
  // unchanged into OutChains.
  SmallVector<SDValue, 16> GluedLoadChains;
  for (unsigned i = From; i < To; ++i) {
    OutChains.push_back(OutLoadChains[i]);
    GluedLoadChains.push_back(OutLoadChains[i]);
  }

  // Single token that makes all loads in the group available.
  SDValue LoadToken =
      DAG.getNode(ISD::TokenFactor, dl, MVT::Other, GluedLoadChains);

  for (unsigned i = From; i < To; ++i) {
    // Every entry in OutStoreChains was produced by DAG.getTruncStore, so it
    // is guaranteed to be a StoreSDNode: use the unchecked cast<> rather than
    // dyn_cast<>, whose nullptr result was dereferenced without a check.
    StoreSDNode *ST = cast<StoreSDNode>(OutStoreChains[i]);
    SDValue NewStore = DAG.getTruncStore(LoadToken, dl, ST->getValue(),
                                         ST->getBasePtr(), ST->getMemoryVT(),
                                         ST->getMemOperand());
    OutChains.push_back(NewStore);
  }
}
static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size, unsigned Align,
@ -5282,7 +5315,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
MachineMemOperand::Flags MMOFlags =
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
SmallVector<SDValue, 8> OutChains;
SmallVector<SDValue, 16> OutLoadChains;
SmallVector<SDValue, 16> OutStoreChains;
SmallVector<SDValue, 32> OutChains;
unsigned NumMemOps = MemOps.size();
uint64_t SrcOff = 0, DstOff = 0;
for (unsigned i = 0; i != NumMemOps; ++i) {
@ -5316,11 +5351,13 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SubSlice.Length = VTSize;
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode())
if (Value.getNode()) {
Store = DAG.getStore(Chain, dl, Value,
DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), Align,
MMOFlags);
OutChains.push_back(Store);
}
}
if (!Store.getNode()) {
@ -5342,17 +5379,61 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), VT,
MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
OutChains.push_back(Value.getValue(1));
OutLoadChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
OutStoreChains.push_back(Store);
}
OutChains.push_back(Store);
SrcOff += VTSize;
DstOff += VTSize;
Size -= VTSize;
}
unsigned GluedLdStLimit = MaxLdStGlue == 0 ?
TLI.getMaxGluedStoresPerMemcpy() : MaxLdStGlue;
unsigned NumLdStInMemcpy = OutStoreChains.size();
if (NumLdStInMemcpy) {
// It may be that memcpy might be converted to memset if it's memcpy
// of constants. In such a case, we won't have loads and stores, but
// just stores. In the absence of loads, there is nothing to gang up.
if ((GluedLdStLimit <= 1) || !EnableMemCpyDAGOpt) {
// If target does not care, just leave as it.
for (unsigned i = 0; i < NumLdStInMemcpy; ++i) {
OutChains.push_back(OutLoadChains[i]);
OutChains.push_back(OutStoreChains[i]);
}
} else {
// Ld/St less than/equal limit set by target.
if (NumLdStInMemcpy <= GluedLdStLimit) {
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
NumLdStInMemcpy, OutLoadChains,
OutStoreChains);
} else {
unsigned NumberLdChain = NumLdStInMemcpy / GluedLdStLimit;
unsigned RemainingLdStInMemcpy = NumLdStInMemcpy % GluedLdStLimit;
unsigned GlueIter = 0;
for (unsigned cnt = 0; cnt < NumberLdChain; ++cnt) {
unsigned IndexFrom = NumLdStInMemcpy - GlueIter - GluedLdStLimit;
unsigned IndexTo = NumLdStInMemcpy - GlueIter;
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, IndexFrom, IndexTo,
OutLoadChains, OutStoreChains);
GlueIter += GluedLdStLimit;
}
// Residual ld/st.
if (RemainingLdStInMemcpy) {
chainLoadsAndStoresForMemcpy(DAG, dl, OutChains, 0,
RemainingLdStInMemcpy, OutLoadChains,
OutStoreChains);
}
}
}
}
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
}

View File

@ -535,6 +535,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
// Perform these initializations only once.
MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
MaxLoadsPerMemcmp = 8;
MaxGluedStoresPerMemcpy = 0;
MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
UseUnderscoreSetJmp = false;

View File

@ -580,6 +580,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::GlobalAddress);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

View File

@ -8,9 +8,9 @@
; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}}
; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}}
; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]]
; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8]
; CHECK-NEXT: str [[VAL]], [x0, #8]
; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]]
; CHECK-NEXT: str [[VAL2]], [x0]
define void @foo(i8* %a) {

View File

@ -29,10 +29,10 @@ entry:
define void @t1(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t1:
; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
; CHECK: stur [[DEST]], [x0, #15]
; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
; CHECK: str [[DEST]], [x0]
; CHECK: ldur [[DEST:q[0-9]+]], [x[[BASEREG:[0-9]+]], #15]
; CHECK: stur [[DEST:q[0-9]+]], [x0, #15]
; CHECK: str [[DEST:q[0-9]+]], [x0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i1 false)
ret void
}
@ -52,9 +52,9 @@ entry:
define void @t3(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t3:
; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
; CHECK: ldr [[REG4:x[0-9]+]], [x[[BASEREG:[0-9]+]], #16]
; CHECK: str [[REG4]], [x0, #16]
; CHECK: ldr [[DEST:q[0-9]+]], [x[[BASEREG]]]
; CHECK: str [[DEST]], [x0]
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i1 false)
ret void

View File

@ -130,12 +130,9 @@ define void @test_va_copy() {
; CHECK: add x[[SRC:[0-9]+]], {{x[0-9]+}}, :lo12:var
; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]]]
; CHECK: ldp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[SRC]]]
; CHECK: add x[[DST:[0-9]+]], {{x[0-9]+}}, :lo12:second_list
; CHECK: str [[BLOCK]], [x[[DST]]]
; CHECK: ldr [[BLOCK:q[0-9]+]], [x[[SRC]], #16]
; CHECK: str [[BLOCK]], [x[[DST]], #16]
; CHECK: stp [[BLOCK:q[0-9]+]], [[BLOCK:q[0-9]+]], [x[[DST]]]
ret void
; CHECK: ret
}

View File

@ -34,8 +34,8 @@
define void @Precompute_Patch_Values(%struct.Bicubic_Patch_Struct* %Shape) {
; CHECK: Precompute_Patch_Values
; CHECK: ldr [[VAL:x[0-9]+]], [x0, #288]
; CHECK-NEXT: str [[VAL]], [sp, #232]
; CHECK-NEXT: ldr [[VAL2:q[0-9]+]], [x0, #272]
; CHECK-NEXT: str [[VAL]], [sp, #232]
; CHECK-NEXT: stur [[VAL2]], {{\[}}sp, #216]
entry:
%Control_Points = alloca [16 x [3 x double]], align 8

View File

@ -4,16 +4,14 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
target triple = "arm64-apple-ios10.0.0"
; PR33475 - Expect 64-bit operations as 128-operations are not legal
; However, we can generate a paired 64-bit loads and stores, without using
; floating point registers.
; CHECK-LABEL: pr33475
; CHECK-DAG: ldr [[R0:x[0-9]+]], [x1]
; CHECK-DAG: str [[R0]], [x0]
; CHECK-DAG: ldr [[R1:x[0-9]+]], [x1, #8]
; CHECK-DAG: str [[R1]], [x0, #8]
; CHECK-DAG: ldr [[R2:x[0-9]+]], [x1, #16]
; CHECK-DAG: str [[R2]], [x0, #16]
; CHECK-DAG: ldr [[R3:x[0-9]+]], [x1, #24]
; CHECK-DAG: str [[R3]], [x0, #24]
; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1, #16]
; CHECK-DAG: ldp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x1]
; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0, #16]
; CHECK-DAG: stp [[R0:x[0-9]+]], [[R0:x[0-9]+]], [x0]
define void @pr33475(i8* %p0, i8* %p1) noimplicitfloat {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %p0, i8* align 4 %p1, i64 32, i1 false)

View File

@ -44,15 +44,14 @@ entry:
define void @t2(i8* nocapture %C) nounwind {
entry:
; CHECK-LABEL: t2:
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]!
; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]!
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: movs [[INC:r[0-9]+]], #32
; CHECK: add.w r3, r0, #16
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]]
; CHECK: movw [[REG2:r[0-9]+]], #16716
; CHECK: movt [[REG2:r[0-9]+]], #72
; CHECK: str [[REG2]], [r0]
; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1]
; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3]
; CHECK-T1-LABEL: t2:
; CHECK-T1: bl _memcpy
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i1 false)