mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 10:42:39 +01:00
[OpenMPOpt][WIP] Expand parallel region merging
The existing implementation of parallel region merging applies only to consecutive parallel regions that have speculatable sequential instructions in between. This patch lifts that limitation and allows merging with any sequential instructions in between, except calls to unmergable OpenMP runtime functions. The in-between sequential instructions of the merged region are sequentialized in a "master" region, and any output values are broadcast to the following parallel regions and to the sequential continuation of the merged region.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D90909
This commit is contained in:
parent
cd887f1ace
commit
68536a3264
@ -38,7 +38,10 @@ public:
|
||||
void initialize();
|
||||
|
||||
/// Finalize the underlying module, e.g., by outlining regions.
|
||||
void finalize();
|
||||
/// \param AllowExtractorSinking Flag to include sinking instructions,
|
||||
/// emitted by CodeExtractor, in the
|
||||
/// outlined region. Default is false.
|
||||
void finalize(bool AllowExtractorSinking = false);
|
||||
|
||||
/// Add attributes known for \p FnID to \p Fn.
|
||||
void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
|
||||
|
@ -127,7 +127,7 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
|
||||
|
||||
void OpenMPIRBuilder::initialize() { initializeTypes(M); }
|
||||
|
||||
void OpenMPIRBuilder::finalize() {
|
||||
void OpenMPIRBuilder::finalize(bool AllowExtractorSinking) {
|
||||
SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
|
||||
SmallVector<BasicBlock *, 32> Blocks;
|
||||
for (OutlineInfo &OI : OutlineInfos) {
|
||||
@ -170,6 +170,25 @@ void OpenMPIRBuilder::finalize() {
|
||||
BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
|
||||
assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
|
||||
assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
|
||||
if (AllowExtractorSinking) {
|
||||
// Move instructions from the to-be-deleted ArtificialEntry to the entry
|
||||
// basic block of the parallel region. CodeExtractor may have sunk
|
||||
// allocas/bitcasts for values that are solely used in the outlined
|
||||
// region and do not escape.
|
||||
assert(!ArtificialEntry.empty() &&
|
||||
"Expected instructions to sink in the outlined region");
|
||||
for (BasicBlock::iterator It = ArtificialEntry.begin(),
|
||||
End = ArtificialEntry.end();
|
||||
It != End;) {
|
||||
Instruction &I = *It;
|
||||
It++;
|
||||
|
||||
if (I.isTerminator())
|
||||
continue;
|
||||
|
||||
I.moveBefore(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
|
||||
}
|
||||
}
|
||||
OI.EntryBB->moveBefore(&ArtificialEntry);
|
||||
ArtificialEntry.eraseFromParent();
|
||||
}
|
||||
|
@ -28,6 +28,7 @@
|
||||
#include "llvm/Transforms/IPO/Attributor.h"
|
||||
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
||||
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
|
||||
#include "llvm/Transforms/Utils/CodeExtractor.h"
|
||||
|
||||
using namespace llvm;
|
||||
using namespace omp;
|
||||
@ -317,13 +318,17 @@ struct OMPInformationCache : public InformationCache {
|
||||
return NumUses;
|
||||
}
|
||||
|
||||
// Helper function to recollect uses of a runtime function.
|
||||
void recollectUsesForFunction(RuntimeFunction RTF) {
|
||||
auto &RFI = RFIs[RTF];
|
||||
RFI.clearUsesMap();
|
||||
collectUses(RFI, /*CollectStats*/ false);
|
||||
}
|
||||
|
||||
// Helper function to recollect uses of all runtime functions.
|
||||
void recollectUses() {
|
||||
for (int Idx = 0; Idx < RFIs.size(); ++Idx) {
|
||||
auto &RFI = RFIs[static_cast<RuntimeFunction>(Idx)];
|
||||
RFI.clearUsesMap();
|
||||
collectUses(RFI, /*CollectStats*/ false);
|
||||
}
|
||||
for (int Idx = 0; Idx < RFIs.size(); ++Idx)
|
||||
recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
|
||||
}
|
||||
|
||||
/// Helper to initialize all runtime function information for those defined
|
||||
@ -601,15 +606,11 @@ private:
|
||||
if (!RFI.Declaration)
|
||||
return false;
|
||||
|
||||
// Check if there are any __kmpc_push_proc_bind calls for explicit affinities.
|
||||
OMPInformationCache::RuntimeFunctionInfo &ProcBindRFI =
|
||||
OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind];
|
||||
|
||||
// Defensively abort if explicit affinities are set.
|
||||
// TODO: Track ICV proc_bind to merge when mergable regions have the same
|
||||
// affinity.
|
||||
if (ProcBindRFI.Declaration)
|
||||
return false;
|
||||
// Unmergable calls that prevent merging a parallel region.
|
||||
OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
|
||||
OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
|
||||
OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
|
||||
};
|
||||
|
||||
bool Changed = false;
|
||||
LoopInfo *LI = nullptr;
|
||||
@ -637,6 +638,90 @@ private:
|
||||
|
||||
auto FiniCB = [&](InsertPointTy CodeGenIP) {};
|
||||
|
||||
/// Create a sequential execution region within a merged parallel region,
|
||||
/// encapsulated in a master construct with a barrier for synchronization.
|
||||
auto CreateSequentialRegion = [&](Function *OuterFn,
|
||||
BasicBlock *OuterPredBB,
|
||||
Instruction *SeqStartI,
|
||||
Instruction *SeqEndI) {
|
||||
// Isolate the instructions of the sequential region to a separate
|
||||
// block.
|
||||
BasicBlock *ParentBB = SeqStartI->getParent();
|
||||
BasicBlock *SeqEndBB =
|
||||
SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
|
||||
BasicBlock *SeqAfterBB =
|
||||
SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
|
||||
BasicBlock *SeqStartBB =
|
||||
SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
|
||||
|
||||
assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
|
||||
"Expected a different CFG");
|
||||
const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
|
||||
ParentBB->getTerminator()->eraseFromParent();
|
||||
|
||||
auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
|
||||
BasicBlock &ContinuationIP) {
|
||||
BasicBlock *CGStartBB = CodeGenIP.getBlock();
|
||||
BasicBlock *CGEndBB =
|
||||
SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
|
||||
assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
|
||||
CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
|
||||
assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
|
||||
SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
|
||||
};
|
||||
auto FiniCB = [&](InsertPointTy CodeGenIP) {};
|
||||
|
||||
// Find outputs from the sequential region to outside users and
|
||||
// broadcast their values to them.
|
||||
for (Instruction &I : *SeqStartBB) {
|
||||
SmallPtrSet<Instruction *, 4> OutsideUsers;
|
||||
for (User *Usr : I.users()) {
|
||||
Instruction &UsrI = *cast<Instruction>(Usr);
|
||||
// Ignore outputs to lifetime intrinsics, code extraction for the merged
|
||||
// parallel region will fix them.
|
||||
if (UsrI.isLifetimeStartOrEnd())
|
||||
continue;
|
||||
|
||||
if (UsrI.getParent() != SeqStartBB)
|
||||
OutsideUsers.insert(&UsrI);
|
||||
}
|
||||
|
||||
if (OutsideUsers.empty())
|
||||
continue;
|
||||
|
||||
// Emit an alloca in the outer region to store the broadcasted
|
||||
// value.
|
||||
const DataLayout &DL = M.getDataLayout();
|
||||
AllocaInst *AllocaI = new AllocaInst(
|
||||
I.getType(), DL.getAllocaAddrSpace(), nullptr,
|
||||
I.getName() + ".seq.output.alloc", &OuterFn->front().front());
|
||||
|
||||
// Emit a store instruction in the sequential BB to update the
|
||||
// value.
|
||||
new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
|
||||
|
||||
// Emit a load instruction and replace the use of the output value
|
||||
// with it.
|
||||
for (Instruction *UsrI : OutsideUsers) {
|
||||
LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
|
||||
I.getName() + ".seq.output.load", UsrI);
|
||||
UsrI->replaceUsesOfWith(&I, LoadI);
|
||||
}
|
||||
}
|
||||
|
||||
OpenMPIRBuilder::LocationDescription Loc(
|
||||
InsertPointTy(ParentBB, ParentBB->end()), DL);
|
||||
InsertPointTy SeqAfterIP =
|
||||
OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
|
||||
|
||||
OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
|
||||
|
||||
BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
|
||||
|
||||
LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
|
||||
<< "\n");
|
||||
};
|
||||
|
||||
// Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
|
||||
// contained in BB and only separated by instructions that can be
|
||||
// redundantly executed in parallel. The block BB is split before the first
|
||||
@ -682,6 +767,21 @@ private:
|
||||
const DebugLoc DL = BB->getTerminator()->getDebugLoc();
|
||||
BB->getTerminator()->eraseFromParent();
|
||||
|
||||
// Create sequential regions for sequential instructions that are
|
||||
// in-between mergable parallel regions.
|
||||
for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
|
||||
It != End; ++It) {
|
||||
Instruction *ForkCI = *It;
|
||||
Instruction *NextForkCI = *(It + 1);
|
||||
|
||||
// Continue if there are no in-between instructions.
|
||||
if (ForkCI->getNextNode() == NextForkCI)
|
||||
continue;
|
||||
|
||||
CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
|
||||
NextForkCI->getPrevNode());
|
||||
}
|
||||
|
||||
OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
|
||||
DL);
|
||||
IRBuilder<>::InsertPoint AllocaIP(
|
||||
@ -695,7 +795,7 @@ private:
|
||||
BranchInst::Create(AfterBB, AfterIP.getBlock());
|
||||
|
||||
// Perform the actual outlining.
|
||||
OMPInfoCache.OMPBuilder.finalize();
|
||||
OMPInfoCache.OMPBuilder.finalize(/* AllowExtractorSinking */ true);
|
||||
|
||||
Function *OutlinedFn = MergableCIs.front()->getCaller();
|
||||
|
||||
@ -782,16 +882,75 @@ private:
|
||||
BasicBlock *BB = It.getFirst();
|
||||
SmallVector<CallInst *, 4> MergableCIs;
|
||||
|
||||
/// Returns true if the instruction is mergable, false otherwise.
|
||||
/// A terminator instruction is unmergable by definition since merging
|
||||
/// works within a BB. Instructions before the mergable region are
|
||||
/// mergable if they are not calls to OpenMP runtime functions that may
|
||||
/// set different execution parameters for subsequent parallel regions.
|
||||
/// Instructions in-between parallel regions are mergable if they are not
|
||||
/// calls to any non-intrinsic function since that may call a non-mergable
|
||||
/// OpenMP runtime function.
|
||||
auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
|
||||
// We do not merge across BBs, hence return false (unmergable) if the
|
||||
// instruction is a terminator.
|
||||
if (I.isTerminator())
|
||||
return false;
|
||||
|
||||
if (!isa<CallInst>(&I))
|
||||
return true;
|
||||
|
||||
CallInst *CI = cast<CallInst>(&I);
|
||||
if (IsBeforeMergableRegion) {
|
||||
Function *CalledFunction = CI->getCalledFunction();
|
||||
if (!CalledFunction)
|
||||
return false;
|
||||
// Return false (unmergable) if the call before the parallel
|
||||
// region calls an explicit affinity (proc_bind) or number of
|
||||
// threads (num_threads) compiler-generated function. Those settings
|
||||
// may be incompatible with following parallel regions.
|
||||
// TODO: ICV tracking to detect compatibility.
|
||||
for (const auto &RFI : UnmergableCallsInfo) {
|
||||
if (CalledFunction == RFI.Declaration)
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
// Return false (unmergable) if there is a call instruction
|
||||
// in-between parallel regions when it is not an intrinsic. It
|
||||
// may call an unmergable OpenMP runtime function in its callpath.
|
||||
// TODO: Keep track of possible OpenMP calls in the callpath.
|
||||
if (!isa<IntrinsicInst>(CI))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
};
|
||||
// Find maximal number of parallel region CIs that are safe to merge.
|
||||
for (Instruction &I : *BB) {
|
||||
for (auto It = BB->begin(), End = BB->end(); It != End;) {
|
||||
Instruction &I = *It;
|
||||
++It;
|
||||
|
||||
if (CIs.count(&I)) {
|
||||
MergableCIs.push_back(cast<CallInst>(&I));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (isSafeToSpeculativelyExecute(&I, &I, DT))
|
||||
// Continue expanding if the instruction is mergable.
|
||||
if (IsMergable(I, MergableCIs.empty()))
|
||||
continue;
|
||||
|
||||
// Forward the instruction iterator to skip the next parallel region
|
||||
// since there is an unmergable instruction which can affect it.
|
||||
for (; It != End; ++It) {
|
||||
Instruction &SkipI = *It;
|
||||
if (CIs.count(&SkipI)) {
|
||||
LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
|
||||
<< " due to " << I << "\n");
|
||||
++It;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Store mergable regions found.
|
||||
if (MergableCIs.size() > 1) {
|
||||
MergableCIsVector.push_back(MergableCIs);
|
||||
LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
|
||||
@ -812,15 +971,12 @@ private:
|
||||
}
|
||||
|
||||
if (Changed) {
|
||||
// Update RFI info to set it up for later passes.
|
||||
RFI.clearUsesMap();
|
||||
OMPInfoCache.collectUses(RFI, /* CollectStats */ false);
|
||||
|
||||
// Collect uses for the emitted barrier call.
|
||||
OMPInformationCache::RuntimeFunctionInfo &BarrierRFI =
|
||||
OMPInfoCache.RFIs[OMPRTL___kmpc_barrier];
|
||||
BarrierRFI.clearUsesMap();
|
||||
OMPInfoCache.collectUses(BarrierRFI, /* CollectStats */ false);
|
||||
/// Re-collect uses for fork calls, emitted barrier calls, and
|
||||
/// any emitted master/end_master calls.
|
||||
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
|
||||
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
|
||||
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
|
||||
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
|
||||
}
|
||||
|
||||
return Changed;
|
||||
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user