From 2dd5f52ad66a9c0123c54cc01806a95e3758fdd9 Mon Sep 17 00:00:00 2001 From: Mats Petersson Date: Fri, 16 Apr 2021 15:08:56 +0100 Subject: [PATCH] [OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic The implementation supports static schedule for Fortran do loops. This implements the dynamic variant of the same concept. Reviewed By: Meinersbur Differential Revision: https://reviews.llvm.org/D97393 --- include/llvm/Frontend/OpenMP/OMPConstants.h | 11 ++ include/llvm/Frontend/OpenMP/OMPIRBuilder.h | 26 +++- lib/Frontend/OpenMP/OMPIRBuilder.cpp | 150 +++++++++++++++++++- unittests/Frontend/OpenMPIRBuilderTest.cpp | 99 +++++++++++++ 4 files changed, 280 insertions(+), 6 deletions(-) diff --git a/include/llvm/Frontend/OpenMP/OMPConstants.h b/include/llvm/Frontend/OpenMP/OMPConstants.h index 4c67ea332aa..0e0cb0189f5 100644 --- a/include/llvm/Frontend/OpenMP/OMPConstants.h +++ b/include/llvm/Frontend/OpenMP/OMPConstants.h @@ -107,6 +107,17 @@ inline std::string getAllAssumeClauseOptions() { return S + "'"; } +/// \note This needs to be kept in sync with kmp.h enum sched_type. +/// Todo: Update kmp.h to include this file, and remove the enums in kmp.h +/// To complete this, more enum values will need to be moved here. +enum class OMPScheduleType { + Static = 34, /**< static unspecialized */ + DynamicChunked = 35, + ModifierNonmonotonic = + (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */ + LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic) +}; + } // end namespace omp } // end namespace llvm diff --git a/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index 5a4b4066494..9c657cb6f1d 100644 --- a/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -355,7 +355,7 @@ public: /// \param CLI A descriptor of the canonical loop to workshare. 
/// \param AllocaIP An insertion point for Alloca instructions usable in the /// preheader of the loop. - /// \param NeedsBarrier Indicates whether a barrier must be insterted after + /// \param NeedsBarrier Indicates whether a barrier must be inserted after /// the loop. /// \param Chunk The size of loop chunk considered as a unit when /// scheduling. If \p nullptr, defaults to 1. @@ -367,6 +367,30 @@ public: bool NeedsBarrier, Value *Chunk = nullptr); + /// Modifies the canonical loop to be a dynamically-scheduled workshare loop. + /// + /// This takes a \p LoopInfo representing a canonical loop, such as the one + /// created by \p createCanonicalLoop and emits additional instructions to + /// turn it into a workshare loop. In particular, it calls to an OpenMP + /// runtime function in the preheader to obtain, and then in each iteration + /// to update the loop counter. + /// \param Loc The source location description, the insertion location + /// is not used. + /// \param CLI A descriptor of the canonical loop to workshare. + /// \param AllocaIP An insertion point for Alloca instructions usable in the + /// preheader of the loop. + /// \param NeedsBarrier Indicates whether a barrier must be inserted after + /// the loop. + /// \param Chunk The size of loop chunk considered as a unit when + /// scheduling. If \p nullptr, defaults to 1. + /// + /// \returns Point where to insert code after the loop. + InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc, + CanonicalLoopInfo *CLI, + InsertPointTy AllocaIP, + bool NeedsBarrier, + Value *Chunk = nullptr); + /// Modifies the canonical loop to be a workshare loop. 
/// /// This takes a \p LoopInfo representing a canonical loop, such as the one diff --git a/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/lib/Frontend/OpenMP/OMPIRBuilder.cpp index ec9ecced08e..de93f644818 100644 --- a/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -1168,10 +1168,8 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop( Value *ThreadNum = getOrCreateThreadID(SrcLoc); - // TODO: extract scheduling type and map it to OMP constant. This is curently - // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first. - constexpr int StaticSchedType = 34; - Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType); + Constant *SchedulingType = + ConstantInt::get(I32Type, static_cast(OMPScheduleType::Static)); // Call the "init" function and update the trip count of the loop with the // value it produced. @@ -1220,6 +1218,148 @@ CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop( return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier); } +/// Returns an LLVM function to call for initializing loop bounds using OpenMP +/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by +/// the runtime. Always interpret integers as unsigned similarly to +/// CanonicalLoopInfo. +static FunctionCallee +getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) { + unsigned Bitwidth = Ty->getIntegerBitWidth(); + if (Bitwidth == 32) + return OMPBuilder.getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u); + if (Bitwidth == 64) + return OMPBuilder.getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u); + llvm_unreachable("unknown OpenMP loop iterator bitwidth"); +} + +/// Returns an LLVM function to call for updating the next loop using OpenMP +/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by +/// the runtime. 
Always interpret integers as unsigned similarly to +/// CanonicalLoopInfo. +static FunctionCallee +getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) { + unsigned Bitwidth = Ty->getIntegerBitWidth(); + if (Bitwidth == 32) + return OMPBuilder.getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u); + if (Bitwidth == 64) + return OMPBuilder.getOrCreateRuntimeFunction( + M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u); + llvm_unreachable("unknown OpenMP loop iterator bitwidth"); +} + +OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop( + const LocationDescription &Loc, CanonicalLoopInfo *CLI, + InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) { + // Set up the source location value for OpenMP runtime. + Builder.SetCurrentDebugLocation(Loc.DL); + + Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); + Value *SrcLoc = getOrCreateIdent(SrcLocStr); + + // Declare useful OpenMP runtime functions. + Value *IV = CLI->getIndVar(); + Type *IVTy = IV->getType(); + FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this); + FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this); + + // Allocate space for computed loop bounds as expected by the "init" function. + Builder.restoreIP(AllocaIP); + Type *I32Type = Type::getInt32Ty(M.getContext()); + Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter"); + Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound"); + Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound"); + Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride"); + + // At the end of the preheader, prepare for calling the "init" function by + // storing the current loop bounds into the allocated space. A canonical loop + // always iterates from 0 to trip-count with step 1. Note that "init" expects + // and produces an inclusive upper bound. 
+ BasicBlock *PreHeader = CLI->getPreheader(); + Builder.SetInsertPoint(PreHeader->getTerminator()); + Constant *One = ConstantInt::get(IVTy, 1); + Builder.CreateStore(One, PLowerBound); + Value *UpperBound = CLI->getTripCount(); + Builder.CreateStore(UpperBound, PUpperBound); + Builder.CreateStore(One, PStride); + + BasicBlock *Header = CLI->getHeader(); + BasicBlock *Exit = CLI->getExit(); + BasicBlock *Cond = CLI->getCond(); + InsertPointTy AfterIP = CLI->getAfterIP(); + + // The CLI will be "broken" in the code below, as the loop is no longer + // a valid canonical loop. + + if (!Chunk) + Chunk = One; + + Value *ThreadNum = getOrCreateThreadID(SrcLoc); + + OMPScheduleType DynamicSchedType = + OMPScheduleType::DynamicChunked | OMPScheduleType::ModifierNonmonotonic; + Constant *SchedulingType = + ConstantInt::get(I32Type, static_cast(DynamicSchedType)); + + // Call the "init" function. + Builder.CreateCall(DynamicInit, + {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One, + UpperBound, /* step */ One, Chunk}); + + // An outer loop around the existing one. + BasicBlock *OuterCond = BasicBlock::Create( + PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond", + PreHeader->getParent()); + // This needs to be 32-bit always, so can't use the IVTy Zero above. + Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt()); + Value *Res = + Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter, + PLowerBound, PUpperBound, PStride}); + Constant *Zero32 = ConstantInt::get(I32Type, 0); + Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32); + Value *LowerBound = + Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb"); + Builder.CreateCondBr(MoreWork, Header, Exit); + + // Change PHI-node in loop header to use outer cond rather than preheader, + // and set IV to the LowerBound. 
+ Instruction *Phi = &Header->front(); + auto *PI = cast(Phi); + PI->setIncomingBlock(0, OuterCond); + PI->setIncomingValue(0, LowerBound); + + // Then set the pre-header to jump to the OuterCond + Instruction *Term = PreHeader->getTerminator(); + auto *Br = cast(Term); + Br->setSuccessor(0, OuterCond); + + // Modify the inner condition: + // * Use the UpperBound returned from the DynamicNext call. + // * jump to the loop outer loop when done with one of the inner loops. + Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt()); + UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub"); + Instruction *Comp = &*Builder.GetInsertPoint(); + auto *CI = cast(Comp); + CI->setOperand(1, UpperBound); + // Redirect the inner exit to branch to outer condition. + Instruction *Branch = &Cond->back(); + auto *BI = cast(Branch); + assert(BI->getSuccessor(1) == Exit); + BI->setSuccessor(1, OuterCond); + + // Add the barrier if requested. + if (NeedsBarrier) { + Builder.SetInsertPoint(&Exit->back()); + createBarrier(LocationDescription(Builder.saveIP(), Loc.DL), + omp::Directive::OMPD_for, /* ForceSimpleCall */ false, + /* CheckCancelFlag */ false); + } + + return AfterIP; +} + /// Make \p Source branch to \p Target. 
/// /// Handles two situations: @@ -1901,7 +2041,7 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate( llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache}; Function *Fn = - getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached); + getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached); return Builder.CreateCall(Fn, Args); } diff --git a/unittests/Frontend/OpenMPIRBuilderTest.cpp b/unittests/Frontend/OpenMPIRBuilderTest.cpp index da813671804..77913e67113 100644 --- a/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -1708,6 +1708,105 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) { EXPECT_EQ(NumCallsInExitBlock, 3u); } +TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoop) { + using InsertPointTy = OpenMPIRBuilder::InsertPointTy; + OpenMPIRBuilder OMPBuilder(*M); + OMPBuilder.initialize(); + IRBuilder<> Builder(BB); + OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL}); + + Type *LCTy = Type::getInt32Ty(Ctx); + Value *StartVal = ConstantInt::get(LCTy, 10); + Value *StopVal = ConstantInt::get(LCTy, 52); + Value *StepVal = ConstantInt::get(LCTy, 2); + Value *ChunkVal = ConstantInt::get(LCTy, 7); + auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {}; + + CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop( + Loc, LoopBodyGen, StartVal, StopVal, StepVal, + /*IsSigned=*/false, /*InclusiveStop=*/false); + + Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); + InsertPointTy AllocaIP = Builder.saveIP(); + + // Collect all the info from CLI, as it isn't usable after the call to + // createDynamicWorkshareLoop. + InsertPointTy AfterIP = CLI->getAfterIP(); + BasicBlock *Preheader = CLI->getPreheader(); + BasicBlock *ExitBlock = CLI->getExit(); + Value *IV = CLI->getIndVar(); + + InsertPointTy EndIP = + OMPBuilder.createDynamicWorkshareLoop(Loc, CLI, AllocaIP, + /*NeedsBarrier=*/true, ChunkVal); + // The returned value should be the "after" point. 
+ ASSERT_EQ(EndIP.getBlock(), AfterIP.getBlock()); + ASSERT_EQ(EndIP.getPoint(), AfterIP.getPoint()); + + auto AllocaIter = BB->begin(); + ASSERT_GE(std::distance(BB->begin(), BB->end()), 4); + AllocaInst *PLastIter = dyn_cast(&*(AllocaIter++)); + AllocaInst *PLowerBound = dyn_cast(&*(AllocaIter++)); + AllocaInst *PUpperBound = dyn_cast(&*(AllocaIter++)); + AllocaInst *PStride = dyn_cast(&*(AllocaIter++)); + EXPECT_NE(PLastIter, nullptr); + EXPECT_NE(PLowerBound, nullptr); + EXPECT_NE(PUpperBound, nullptr); + EXPECT_NE(PStride, nullptr); + + auto PreheaderIter = Preheader->begin(); + ASSERT_GE(std::distance(Preheader->begin(), Preheader->end()), 6); + StoreInst *LowerBoundStore = dyn_cast(&*(PreheaderIter++)); + StoreInst *UpperBoundStore = dyn_cast(&*(PreheaderIter++)); + StoreInst *StrideStore = dyn_cast(&*(PreheaderIter++)); + ASSERT_NE(LowerBoundStore, nullptr); + ASSERT_NE(UpperBoundStore, nullptr); + ASSERT_NE(StrideStore, nullptr); + + CallInst *ThreadIdCall = dyn_cast(&*(PreheaderIter++)); + ASSERT_NE(ThreadIdCall, nullptr); + EXPECT_EQ(ThreadIdCall->getCalledFunction()->getName(), + "__kmpc_global_thread_num"); + + CallInst *InitCall = dyn_cast(&*PreheaderIter); + + ASSERT_NE(InitCall, nullptr); + EXPECT_EQ(InitCall->getCalledFunction()->getName(), + "__kmpc_dispatch_init_4u"); + EXPECT_EQ(InitCall->getNumArgOperands(), 7U); + EXPECT_EQ(InitCall->getArgOperand(6), + ConstantInt::get(Type::getInt32Ty(Ctx), 7)); + + ConstantInt *OrigLowerBound = + dyn_cast(LowerBoundStore->getValueOperand()); + ConstantInt *OrigUpperBound = + dyn_cast(UpperBoundStore->getValueOperand()); + ConstantInt *OrigStride = + dyn_cast(StrideStore->getValueOperand()); + ASSERT_NE(OrigLowerBound, nullptr); + ASSERT_NE(OrigUpperBound, nullptr); + ASSERT_NE(OrigStride, nullptr); + EXPECT_EQ(OrigLowerBound->getValue(), 1); + EXPECT_EQ(OrigUpperBound->getValue(), 21); + EXPECT_EQ(OrigStride->getValue(), 1); + + // The original loop iterator should only be used in the condition, in the + // 
increment and in the statement that adds the lower bound to it. + EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3); + + // The exit block should contain the barrier call, plus the call to obtain + // the thread ID. + size_t NumCallsInExitBlock = + count_if(*ExitBlock, [](Instruction &I) { return isa(I); }); + EXPECT_EQ(NumCallsInExitBlock, 2u); + + // Add a termination to our block and check that it is internally consistent. + Builder.restoreIP(EndIP); + Builder.CreateRetVoid(); + OMPBuilder.finalize(); + EXPECT_FALSE(verifyModule(*M, &errs())); +} + TEST_F(OpenMPIRBuilderTest, MasterDirective) { using InsertPointTy = OpenMPIRBuilder::InsertPointTy; OpenMPIRBuilder OMPBuilder(*M);