mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-24 11:42:57 +01:00
Implement initial support for PHI translation in memdep. This means that
memdep keeps track of how PHIs affect the pointer in dep queries, which allows it to eliminate the load in cases like rle-phi-translate.ll, which basically end up being: BB1: X = load P br BB3 BB2: Y = load Q br BB3 BB3: R = phi [P] [Q] load R turning "load R" into a phi of X/Y. In addition to additional exposed opportunities, this makes memdep safe in many cases that it wasn't before (which is required for load PRE) and also makes it substantially more efficient. For example, consider: bb1: // has many predecessors. P = some_operator() load P In this example, previously memdep would scan all the predecessors of BB1 to see if they had something that would mustalias P. In some cases (e.g. test/Transforms/GVN/rle-must-alias.ll) it would actually find them and end up eliminating something. In many other cases though, it would scan and not find anything useful. MemDep now stops at a block if the pointer is defined in that block and cannot be phi translated to predecessors. This causes it to miss the (rare) cases like rle-must-alias.ll, but makes it faster by not scanning tons of stuff that is unlikely to be useful. For example, this speeds up GVN as a whole from 3.928s to 2.448s (60%)!. IMO, scalar GVN should be enhanced to simplify the rle-must-alias pointer base anyway, which would allow the loads to be eliminated. In the future, this should be enhanced to phi translate through geps and bitcasts as well (as indicated by FIXMEs) making memdep even more powerful. llvm-svn: 61022
This commit is contained in:
parent
c2d2c0bdf3
commit
c92b131639
@ -156,12 +156,18 @@ namespace llvm {
|
||||
/// ValueIsLoadPair - This is a pair<Value*, bool> where the bool is true if
|
||||
/// the dependence is a read only dependence, false if read/write.
|
||||
typedef PointerIntPair<Value*, 1, bool> ValueIsLoadPair;
|
||||
|
||||
|
||||
/// BBSkipFirstBlockPair - This pair is used when caching information for a
|
||||
/// block. If the pointer is null, the cache value is not a full query that
|
||||
/// starts at the specified block. If non-null, the bool indicates whether
|
||||
/// or not the contents of the block was skipped.
|
||||
typedef PointerIntPair<BasicBlock*, 1, bool> BBSkipFirstBlockPair;
|
||||
|
||||
/// CachedNonLocalPointerInfo - This map stores the cached results of doing
|
||||
/// a pointer lookup at the bottom of a block. The key of this map is the
|
||||
/// pointer+isload bit, the value is a list of <bb->result> mappings.
|
||||
typedef DenseMap<ValueIsLoadPair,
|
||||
std::pair<BasicBlock*, NonLocalDepInfo> > CachedNonLocalPointerInfo;
|
||||
typedef DenseMap<ValueIsLoadPair, std::pair<BBSkipFirstBlockPair,
|
||||
NonLocalDepInfo> > CachedNonLocalPointerInfo;
|
||||
CachedNonLocalPointerInfo NonLocalPointerDeps;
|
||||
|
||||
// A map from instructions to their non-local pointer dependencies.
|
||||
@ -259,10 +265,11 @@ namespace llvm {
|
||||
MemDepResult getCallSiteDependencyFrom(CallSite C, bool isReadOnlyCall,
|
||||
BasicBlock::iterator ScanIt,
|
||||
BasicBlock *BB);
|
||||
void getNonLocalPointerDepFromBB(Value *Pointer, uint64_t Size,
|
||||
bool getNonLocalPointerDepFromBB(Value *Pointer, uint64_t Size,
|
||||
bool isLoad, BasicBlock *BB,
|
||||
SmallVectorImpl<NonLocalDepEntry> &Result,
|
||||
SmallPtrSet<BasicBlock*, 64> &Visited);
|
||||
DenseMap<BasicBlock*, Value*> &Visited,
|
||||
bool SkipFirstBlock = false);
|
||||
MemDepResult GetNonLocalInfoForBlock(Value *Pointer, uint64_t PointeeSize,
|
||||
bool isLoad, BasicBlock *BB,
|
||||
NonLocalDepInfo *Cache,
|
||||
|
@ -382,8 +382,7 @@ MemoryDependenceAnalysis::getNonLocalCallDependency(CallSite QueryCS) {
|
||||
|
||||
// isReadonlyCall - If this is a read-only call, we can be more aggressive.
|
||||
bool isReadonlyCall = AA->onlyReadsMemory(QueryCS);
|
||||
|
||||
// Visited checked first, vector in sorted order.
|
||||
|
||||
SmallPtrSet<BasicBlock*, 64> Visited;
|
||||
|
||||
unsigned NumSortedEntries = Cache.size();
|
||||
@ -487,10 +486,16 @@ getNonLocalPointerDependency(Value *Pointer, bool isLoad, BasicBlock *FromBB,
|
||||
const Type *EltTy = cast<PointerType>(Pointer->getType())->getElementType();
|
||||
uint64_t PointeeSize = TD->getTypeStoreSize(EltTy);
|
||||
|
||||
// While we have blocks to analyze, get their values.
|
||||
SmallPtrSet<BasicBlock*, 64> Visited;
|
||||
getNonLocalPointerDepFromBB(Pointer, PointeeSize, isLoad, FromBB,
|
||||
Result, Visited);
|
||||
// This is the set of blocks we've inspected, and the pointer we consider in
|
||||
// each block. Because of critical edges, we currently bail out if querying
|
||||
// a block with multiple different pointers. This can happen during PHI
|
||||
// translation.
|
||||
DenseMap<BasicBlock*, Value*> Visited;
|
||||
if (!getNonLocalPointerDepFromBB(Pointer, PointeeSize, isLoad, FromBB,
|
||||
Result, Visited, true))
|
||||
return;
|
||||
Result.push_back(std::make_pair(FromBB,
|
||||
MemDepResult::getClobber(FromBB->begin())));
|
||||
}
|
||||
|
||||
/// GetNonLocalInfoForBlock - Compute the memdep value for BB with
|
||||
@ -566,35 +571,51 @@ GetNonLocalInfoForBlock(Value *Pointer, uint64_t PointeeSize,
|
||||
}
|
||||
|
||||
|
||||
/// getNonLocalPointerDepFromBB -
|
||||
void MemoryDependenceAnalysis::
|
||||
/// getNonLocalPointerDepFromBB - Perform a dependency query based on
|
||||
/// pointer/pointeesize starting at the end of StartBB. Add any clobber/def
|
||||
/// results to the results vector and keep track of which blocks are visited in
|
||||
/// 'Visited'.
|
||||
///
|
||||
/// This has special behavior for the first block queries (when SkipFirstBlock
|
||||
/// is true). In this special case, it ignores the contents of the specified
|
||||
/// block and starts returning dependence info for its predecessors.
|
||||
///
|
||||
/// This function returns false on success, or true to indicate that it could
|
||||
/// not compute dependence information for some reason. This should be treated
|
||||
/// as a clobber dependence on the first instruction in the predecessor block.
|
||||
bool MemoryDependenceAnalysis::
|
||||
getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
|
||||
bool isLoad, BasicBlock *StartBB,
|
||||
SmallVectorImpl<NonLocalDepEntry> &Result,
|
||||
SmallPtrSet<BasicBlock*, 64> &Visited) {
|
||||
DenseMap<BasicBlock*, Value*> &Visited,
|
||||
bool SkipFirstBlock) {
|
||||
|
||||
// Look up the cached info for Pointer.
|
||||
ValueIsLoadPair CacheKey(Pointer, isLoad);
|
||||
|
||||
std::pair<BasicBlock*, NonLocalDepInfo> &CacheInfo =
|
||||
NonLocalPointerDeps[CacheKey];
|
||||
NonLocalDepInfo *Cache = &CacheInfo.second;
|
||||
std::pair<BBSkipFirstBlockPair, NonLocalDepInfo> *CacheInfo =
|
||||
&NonLocalPointerDeps[CacheKey];
|
||||
NonLocalDepInfo *Cache = &CacheInfo->second;
|
||||
|
||||
// If we have valid cached information for exactly the block we are
|
||||
// investigating, just return it with no recomputation.
|
||||
if (CacheInfo.first == StartBB) {
|
||||
if (CacheInfo->first == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
|
||||
for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
|
||||
I != E; ++I)
|
||||
if (!I->second.isNonLocal())
|
||||
Result.push_back(*I);
|
||||
++NumCacheCompleteNonLocalPtr;
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Otherwise, either this is a new block, a block with an invalid cache
|
||||
// pointer or one that we're about to invalidate by putting more info into it
|
||||
// than its valid cache info. If empty, the result will be valid cache info,
|
||||
// otherwise it isn't.
|
||||
CacheInfo.first = Cache->empty() ? StartBB : 0;
|
||||
if (Cache->empty())
|
||||
CacheInfo->first = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
|
||||
else
|
||||
CacheInfo->first = BBSkipFirstBlockPair();
|
||||
|
||||
SmallVector<BasicBlock*, 32> Worklist;
|
||||
Worklist.push_back(StartBB);
|
||||
@ -606,23 +627,14 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
|
||||
// revisit blocks after we insert info for them.
|
||||
unsigned NumSortedEntries = Cache->size();
|
||||
|
||||
// SkipFirstBlock - If this is the very first block that we're processing, we
|
||||
// don't want to scan or think about its body, because the client was supposed
|
||||
// to do a local dependence query. Instead, just start processing it by
|
||||
// adding its predecessors to the worklist and iterating.
|
||||
bool SkipFirstBlock = Visited.empty();
|
||||
|
||||
while (!Worklist.empty()) {
|
||||
BasicBlock *BB = Worklist.pop_back_val();
|
||||
|
||||
// Skip the first block if we have it.
|
||||
if (SkipFirstBlock) {
|
||||
SkipFirstBlock = false;
|
||||
} else {
|
||||
if (!SkipFirstBlock) {
|
||||
// Analyze the dependency of *Pointer in FromBB. See if we already have
|
||||
// been here.
|
||||
if (!Visited.insert(BB))
|
||||
continue;
|
||||
assert(Visited.count(BB) && "Should check 'visited' before adding to WL");
|
||||
|
||||
// Get the dependency info for Pointer in BB. If we have cached
|
||||
// information, we will use it, otherwise we compute it.
|
||||
@ -636,11 +648,123 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
|
||||
}
|
||||
}
|
||||
|
||||
// Otherwise, we have to process all the predecessors of this block to scan
|
||||
// them as well.
|
||||
for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
|
||||
// TODO: PHI TRANSLATE.
|
||||
Worklist.push_back(*PI);
|
||||
// If 'Pointer' is an instruction defined in this block, then we need to do
|
||||
// phi translation to change it into a value live in the predecessor block.
|
||||
// If phi translation fails, then we can't continue dependence analysis.
|
||||
Instruction *PtrInst = dyn_cast<Instruction>(Pointer);
|
||||
bool NeedsPHITranslation = PtrInst && PtrInst->getParent() == BB;
|
||||
|
||||
// If no PHI translation is needed, just add all the predecessors of this
|
||||
// block to scan them as well.
|
||||
if (!NeedsPHITranslation) {
|
||||
SkipFirstBlock = false;
|
||||
for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI) {
|
||||
// Verify that we haven't looked at this block yet.
|
||||
std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
|
||||
InsertRes = Visited.insert(std::make_pair(*PI, Pointer));
|
||||
if (InsertRes.second) {
|
||||
// First time we've looked at *PI.
|
||||
Worklist.push_back(*PI);
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we have seen this block before, but it was with a different
|
||||
// pointer then we have a phi translation failure and we have to treat
|
||||
// this as a clobber.
|
||||
if (InsertRes.first->second != Pointer)
|
||||
goto PredTranslationFailure;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// If we do need to do phi translation, then there are a bunch of different
|
||||
// cases, because we have to find a Value* live in the predecessor block. We
|
||||
// know that PtrInst is defined in this block at least.
|
||||
|
||||
// If this is directly a PHI node, just use the incoming values for each
|
||||
// pred as the phi translated version.
|
||||
if (PHINode *PtrPHI = dyn_cast<PHINode>(PtrInst)) {
|
||||
for (BasicBlock **PI = PredCache->GetPreds(BB); *PI; ++PI){
|
||||
BasicBlock *Pred = *PI;
|
||||
Value *PredPtr = PtrPHI->getIncomingValueForBlock(Pred);
|
||||
|
||||
// Check to see if we have already visited this pred block with another
|
||||
// pointer. If so, we can't do this lookup. This failure can occur
|
||||
// with PHI translation when a critical edge exists and the PHI node in
|
||||
// the successor translates to a pointer value different than the
|
||||
// pointer the block was first analyzed with.
|
||||
std::pair<DenseMap<BasicBlock*,Value*>::iterator, bool>
|
||||
InsertRes = Visited.insert(std::make_pair(Pred, PredPtr));
|
||||
|
||||
if (!InsertRes.second) {
|
||||
// If the predecessor was visited with PredPtr, then we already did
|
||||
// the analysis and can ignore it.
|
||||
if (InsertRes.first->second == PredPtr)
|
||||
continue;
|
||||
|
||||
// Otherwise, the block was previously analyzed with a different
|
||||
// pointer. We can't represent the result of this case, so we just
|
||||
// treat this as a phi translation failure.
|
||||
goto PredTranslationFailure;
|
||||
}
|
||||
|
||||
// If we have a problem phi translating, fall through to the code below
|
||||
// to handle the failure condition.
|
||||
if (getNonLocalPointerDepFromBB(PredPtr, PointeeSize, isLoad, Pred,
|
||||
Result, Visited))
|
||||
goto PredTranslationFailure;
|
||||
}
|
||||
|
||||
// Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
|
||||
CacheInfo = &NonLocalPointerDeps[CacheKey];
|
||||
Cache = &CacheInfo->second;
|
||||
|
||||
// Since we did phi translation, the "Cache" set won't contain all of the
|
||||
// results for the query. This is ok (we can still use it to accelerate
|
||||
// specific block queries) but we can't do the fastpath "return all
|
||||
// results from the set" Clear out the indicator for this.
|
||||
CacheInfo->first = BBSkipFirstBlockPair();
|
||||
SkipFirstBlock = false;
|
||||
continue;
|
||||
}
|
||||
|
||||
// TODO: BITCAST, GEP.
|
||||
|
||||
// cerr << "MEMDEP: Could not PHI translate: " << *Pointer;
|
||||
// if (isa<BitCastInst>(PtrInst) || isa<GetElementPtrInst>(PtrInst))
|
||||
// cerr << "OP:\t\t\t\t" << *PtrInst->getOperand(0);
|
||||
PredTranslationFailure:
|
||||
|
||||
// Refresh the CacheInfo/Cache pointer so that it isn't invalidated.
|
||||
CacheInfo = &NonLocalPointerDeps[CacheKey];
|
||||
Cache = &CacheInfo->second;
|
||||
|
||||
// Since we did phi translation, the "Cache" set won't contain all of the
|
||||
// results for the query. This is ok (we can still use it to accelerate
|
||||
// specific block queries) but we can't do the fastpath "return all
|
||||
// results from the set" Clear out the indicator for this.
|
||||
CacheInfo->first = BBSkipFirstBlockPair();
|
||||
|
||||
// If *nothing* works, mark the pointer as being clobbered by the first
|
||||
// instruction in this block.
|
||||
//
|
||||
// If this is the magic first block, return this as a clobber of the whole
|
||||
// incoming value. Since we can't phi translate to one of the predecessors,
|
||||
// we have to bail out.
|
||||
if (SkipFirstBlock)
|
||||
return true;
|
||||
|
||||
for (NonLocalDepInfo::reverse_iterator I = Cache->rbegin(); ; ++I) {
|
||||
assert(I != Cache->rend() && "Didn't find current block??");
|
||||
if (I->first != BB)
|
||||
continue;
|
||||
|
||||
assert(I->second.isNonLocal() &&
|
||||
"Should only be here with transparent block");
|
||||
I->second = MemDepResult::getClobber(BB->begin());
|
||||
ReverseNonLocalPtrDeps[BB->begin()].insert(CacheKey.getOpaqueValue());
|
||||
Result.push_back(*I);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@ -658,19 +782,22 @@ getNonLocalPointerDepFromBB(Value *Pointer, uint64_t PointeeSize,
|
||||
Cache->insert(Entry, Val);
|
||||
// FALL THROUGH.
|
||||
}
|
||||
case 1: {
|
||||
case 1:
|
||||
// One new entry, Just insert the new value at the appropriate position.
|
||||
NonLocalDepEntry Val = Cache->back();
|
||||
Cache->pop_back();
|
||||
NonLocalDepInfo::iterator Entry =
|
||||
std::upper_bound(Cache->begin(), Cache->end(), Val);
|
||||
Cache->insert(Entry, Val);
|
||||
if (Cache->size() != 1) {
|
||||
NonLocalDepEntry Val = Cache->back();
|
||||
Cache->pop_back();
|
||||
NonLocalDepInfo::iterator Entry =
|
||||
std::upper_bound(Cache->begin(), Cache->end(), Val);
|
||||
Cache->insert(Entry, Val);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
// Added many values, do a full scale sort.
|
||||
std::sort(Cache->begin(), Cache->end());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/// RemoveCachedNonLocalPointerDependencies - If P exists in
|
||||
@ -851,7 +978,7 @@ void MemoryDependenceAnalysis::removeInstruction(Instruction *RemInst) {
|
||||
NonLocalDepInfo &NLPDI = NonLocalPointerDeps[P].second;
|
||||
|
||||
// The cache is not valid for any specific block anymore.
|
||||
NonLocalPointerDeps[P].first = 0;
|
||||
NonLocalPointerDeps[P].first = BBSkipFirstBlockPair();
|
||||
|
||||
// Update any entries for RemInst to use the instruction after it.
|
||||
for (NonLocalDepInfo::iterator DI = NLPDI.begin(), DE = NLPDI.end();
|
||||
|
@ -1,4 +1,9 @@
|
||||
; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep {DEAD.rle = phi i32}
|
||||
; XFAIL: *
|
||||
|
||||
; FIXME: GVN should eliminate the fully redundant %9 GEP which
|
||||
; allows DEAD to be removed. This is PR3198.
|
||||
|
||||
; The %7 and %4 loads combine to make %DEAD unneeded.
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
|
||||
target triple = "i386-apple-darwin7"
|
||||
|
@ -1,4 +1,7 @@
|
||||
; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep load
|
||||
; FIXME: This should be promotable, but memdep/gvn don't track values
|
||||
; path/edge sensitively enough.
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
|
||||
target triple = "i386-apple-darwin7"
|
||||
|
||||
|
32
test/Transforms/GVN/rle-phi-translate.ll
Normal file
32
test/Transforms/GVN/rle-phi-translate.ll
Normal file
@ -0,0 +1,32 @@
|
||||
; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep {%cv.rle = phi i32}
|
||||
; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep {%bv.rle = phi i32}
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
|
||||
target triple = "i386-apple-darwin7"
|
||||
|
||||
define i32 @g(i32* %b, i32* %c) nounwind {
|
||||
entry:
|
||||
%g = alloca i32 ; <i32*> [#uses=4]
|
||||
%t1 = icmp eq i32* %b, null ; <i1> [#uses=1]
|
||||
br i1 %t1, label %bb, label %bb1
|
||||
|
||||
bb: ; preds = %entry
|
||||
%t2 = load i32* %c, align 4 ; <i32> [#uses=1]
|
||||
%t3 = add i32 %t2, 1 ; <i32> [#uses=1]
|
||||
store i32 %t3, i32* %g, align 4
|
||||
br label %bb2
|
||||
|
||||
bb1: ; preds = %entry
|
||||
%t5 = load i32* %b, align 4 ; <i32> [#uses=1]
|
||||
%t6 = add i32 %t5, 1 ; <i32> [#uses=1]
|
||||
store i32 %t6, i32* %g, align 4
|
||||
br label %bb2
|
||||
|
||||
bb2: ; preds = %bb1, %bb
|
||||
%c_addr.0 = phi i32* [ %g, %bb1 ], [ %c, %bb ] ; <i32*> [#uses=1]
|
||||
%b_addr.0 = phi i32* [ %b, %bb1 ], [ %g, %bb ] ; <i32*> [#uses=1]
|
||||
%cv = load i32* %c_addr.0, align 4 ; <i32> [#uses=1]
|
||||
%bv = load i32* %b_addr.0, align 4 ; <i32> [#uses=1]
|
||||
%ret = add i32 %cv, %bv ; <i32> [#uses=1]
|
||||
ret i32 %ret
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user