[LoopDataPrefetch/Aarch64] Allow selective prefetching of large-strided accesses
Summary:
And use this TTI hook for Cyclone. As explained in the original RFC
(http://thread.gmane.org/gmane.comp.compilers.llvm.devel/92758), the HW
prefetcher works with strides up to 2KB. I am also adding tests for this and
the previous change (D17943):

* Cyclone prefetching accesses with a large stride
* Cyclone not prefetching accesses with a small stride
* Generic AArch64 subtarget not prefetching either

Reviewers: hfinkel

Subscribers: aemerson, rengolin, llvm-commits, mzolotukhin

Differential Revision: http://reviews.llvm.org/D17945

llvm-svn: 263771
commit 8d7ad736df (parent 504bc5d49e)
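The core of the change is a single comparison. The sketch below is standalone
C++, not LLVM code; the constant and function names are illustrative, with the
2048-byte value mirroring the patch's cyclone-min-prefetch-stride default. It
shows the gate that the rest of the diff wires through TTI:

// Standalone sketch (not LLVM code) of the gate this patch adds.
#include <cstdint>
#include <cstdlib>
#include <iostream>

// Mirrors the patch's cyclone-min-prefetch-stride default of 2048 bytes.
constexpr std::int64_t kCycloneMinPrefetchStride = 2048;

// A SW prefetch is worthwhile only when the constant byte stride of an
// access reaches the target's minimum; a minimum of 1 accepts any stride.
bool isStrideLargeEnough(std::int64_t StrideBytes, std::int64_t MinStride) {
  if (MinStride <= 1)
    return true;
  return std::llabs(StrideBytes) >= MinStride;
}

int main() {
  // Unit-stride doubles: an 8-byte step, which the HW prefetcher covers -> 0.
  std::cout << isStrideLargeEnough(8, kCycloneMinPrefetchStride) << '\n';
  // 300 doubles per iteration: 300 * 8 = 2400 >= 2048 -> 1.
  std::cout << isStrideLargeEnough(2400, kCycloneMinPrefetchStride) << '\n';
}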
include/llvm/Analysis/TargetTransformInfo.h
@@ -423,6 +423,11 @@ public:
   /// This is currently measured in number of instructions.
   unsigned getPrefetchDistance() const;
 
+  /// \return Some HW prefetchers can handle accesses up to a certain constant
+  /// stride. This is the minimum stride in bytes where it makes sense to start
+  /// adding SW prefetches. The default is 1, i.e. prefetch with any stride.
+  unsigned getMinPrefetchStride() const;
+
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
@@ -618,6 +623,7 @@ public:
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
   virtual unsigned getCacheLineSize() = 0;
   virtual unsigned getPrefetchDistance() = 0;
+  virtual unsigned getMinPrefetchStride() = 0;
   virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
   virtual unsigned
   getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
@@ -788,6 +794,9 @@ public:
     return Impl.getCacheLineSize();
   }
   unsigned getPrefetchDistance() override { return Impl.getPrefetchDistance(); }
+  unsigned getMinPrefetchStride() override {
+    return Impl.getMinPrefetchStride();
+  }
   unsigned getMaxInterleaveFactor(unsigned VF) override {
     return Impl.getMaxInterleaveFactor(VF);
   }
include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -268,6 +268,8 @@ public:
 
   unsigned getPrefetchDistance() { return 0; }
 
+  unsigned getMinPrefetchStride() { return 1; }
+
   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
lib/Analysis/TargetTransformInfo.cpp
@@ -223,6 +223,10 @@ unsigned TargetTransformInfo::getPrefetchDistance() const {
   return TTIImpl->getPrefetchDistance();
 }
 
+unsigned TargetTransformInfo::getMinPrefetchStride() const {
+  return TTIImpl->getMinPrefetchStride();
+}
+
 unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
   return TTIImpl->getMaxInterleaveFactor(VF);
 }
lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,12 @@ static cl::opt<unsigned> CyclonePrefetchDistance(
     cl::desc("Number of instructions to prefetch ahead for Cyclone"),
     cl::init(280), cl::Hidden);
 
+// The HW prefetcher handles accesses with strides up to 2KB.
+static cl::opt<unsigned> CycloneMinPrefetchStride(
+    "cyclone-min-prefetch-stride",
+    cl::desc("Min stride to add prefetches for Cyclone"),
+    cl::init(2048), cl::Hidden);
+
 /// \brief Calculate the cost of materializing a 64-bit value. This helper
 /// method might only calculate a fraction of a larger immediate. Therefore it
 /// is valid to return a cost of ZERO.
@@ -590,3 +596,9 @@ unsigned AArch64TTIImpl::getPrefetchDistance() {
     return CyclonePrefetchDistance;
   return BaseT::getPrefetchDistance();
 }
+
+unsigned AArch64TTIImpl::getMinPrefetchStride() {
+  if (ST->isCyclone())
+    return CycloneMinPrefetchStride;
+  return BaseT::getMinPrefetchStride();
+}
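A note on the design choice here: both thresholds are cl::opt values, so they
can be tuned from the command line without rebuilding. Assuming a build of opt
from this revision, something like opt -cyclone-min-prefetch-stride=4096
-loop-data-prefetch should raise the bar to 4KB; cl::Hidden only hides the
flags from -help output, it does not make them unsettable.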
lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -131,6 +131,8 @@ public:
   unsigned getCacheLineSize();
 
   unsigned getPrefetchDistance();
 
+  unsigned getMinPrefetchStride();
+
   /// @}
 };
lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -73,6 +73,10 @@ namespace {
     bool runOnFunction(Function &F) override;
     bool runOnLoop(Loop *L);
 
+    /// \brief Check if the stride of the accesses is large enough to
+    /// warrant a prefetch.
+    bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+
   private:
     AssumptionCache *AC;
     LoopInfo *LI;
@@ -94,6 +98,22 @@ INITIALIZE_PASS_END(LoopDataPrefetch, "loop-data-prefetch",
 
 FunctionPass *llvm::createLoopDataPrefetchPass() { return new LoopDataPrefetch(); }
 
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
+  unsigned TargetMinStride = TTI->getMinPrefetchStride();
+  // No need to check if any stride goes.
+  if (TargetMinStride <= 1)
+    return true;
+
+  const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+  // If MinStride is set, don't prefetch unless we can ensure that stride is
+  // larger.
+  if (!ConstStride)
+    return false;
+
+  unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+  return TargetMinStride <= AbsStride;
+}
+
 bool LoopDataPrefetch::runOnFunction(Function &F) {
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -184,6 +204,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
       if (!LSCEVAddRec)
         continue;
 
+      // Check if the stride of the accesses is large enough to warrant a
+      // prefetch.
+      if (!isStrideLargeEnough(LSCEVAddRec))
+        continue;
+
       // We don't want to double prefetch individual cache lines. If this load
       // is known to be within one cache line of some other load that has
       // already been prefetched, then don't prefetch this one as well.
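One unit-related note, in case the check in isStrideLargeEnough reads oddly:
the AddRec it receives (LSCEVAddRec above) is the SCEV of the load's address,
so its step recurrence is already a stride in bytes and compares directly
against getMinPrefetchStride(). Strides that are not compile-time constants
conservatively return false: once a minimum is set, the pass only prefetches
when it can prove the stride clears it.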
test/Transforms/LoopDataPrefetch/AArch64/large-stride.ll (new file, 51 lines)
@@ -0,0 +1,51 @@
+; RUN: opt -mcpu=cyclone -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=LARGE_PREFETCH --check-prefix=ALL
+; RUN: opt -mcpu=generic -mtriple=arm64-apple-ios -loop-data-prefetch -S < %s | FileCheck %s --check-prefix=NO_LARGE_PREFETCH --check-prefix=ALL
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
+
+; ALL-LABEL: @small_stride(
+define void @small_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; ALL-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1600
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+; ALL-LABEL: @large_stride(
+define void @large_stride(double* nocapture %a, double* nocapture readonly %b) {
+entry:
+  br label %for.body
+
+; ALL: for.body:
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %b, i64 %indvars.iv
+; LARGE_PREFETCH: call void @llvm.prefetch
+; NO_LARGE_PREFETCH-NOT: call void @llvm.prefetch
+  %0 = load double, double* %arrayidx, align 8
+  %add = fadd double %0, 1.000000e+00
+  %arrayidx2 = getelementptr inbounds double, double* %a, i64 %indvars.iv
+  store double %add, double* %arrayidx2, align 8
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 300
+  %exitcond = icmp eq i64 %indvars.iv.next, 160000
+  br i1 %exitcond, label %for.end, label %for.body
+
+; ALL: for.end:
+for.end:                                          ; preds = %for.body
+  ret void
+}
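The arithmetic behind the two loops: small_stride steps the induction variable
by 1, an 8-byte address stride for doubles, well below Cyclone's 2048-byte
minimum, so both subtargets leave it to the HW prefetcher. large_stride steps
by 300, i.e. 300 * 8 = 2400 bytes, which clears the 2048-byte threshold, so
Cyclone gets a SW prefetch. Generic AArch64 emits none in either case, since
its getPrefetchDistance() keeps the default of 0 and the pass has no distance
to prefetch at.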
test/Transforms/LoopDataPrefetch/AArch64/lit.local.cfg (new file, 4 lines)
@@ -0,0 +1,4 @@
+config.suffixes = ['.ll']
+
+if not 'AArch64' in config.root.targets:
+    config.unsupported = True