2019-11-07 06:20:06 +01:00
|
|
|
//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
|
|
|
|
//
|
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// OpenMP specific optimizations:
|
|
|
|
//
|
|
|
|
// - Deduplication of runtime calls, e.g., omp_get_thread_num.
|
2021-06-22 20:40:31 +02:00
|
|
|
// - Replacing globalized device memory with stack memory.
|
|
|
|
// - Replacing globalized device memory with shared memory.
|
2019-11-07 06:20:06 +01:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
#include "llvm/Transforms/IPO/OpenMPOpt.h"
|
|
|
|
|
|
|
|
#include "llvm/ADT/EnumeratedArray.h"
|
2021-04-28 22:22:53 +02:00
|
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
2019-11-07 06:20:06 +01:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
|
|
|
#include "llvm/Analysis/CallGraph.h"
|
|
|
|
#include "llvm/Analysis/CallGraphSCCPass.h"
|
2020-05-13 19:19:02 +02:00
|
|
|
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
|
2020-07-07 23:14:47 +02:00
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
2019-11-07 06:20:06 +01:00
|
|
|
#include "llvm/Frontend/OpenMP/OMPConstants.h"
|
2020-02-09 01:03:40 +01:00
|
|
|
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
|
2021-05-20 07:37:29 +02:00
|
|
|
#include "llvm/IR/Assumptions.h"
|
|
|
|
#include "llvm/IR/DiagnosticInfo.h"
|
2021-06-23 23:33:49 +02:00
|
|
|
#include "llvm/IR/GlobalValue.h"
|
2021-05-20 07:37:29 +02:00
|
|
|
#include "llvm/IR/Instruction.h"
|
2021-05-19 02:10:05 +02:00
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2019-11-07 06:20:06 +01:00
|
|
|
#include "llvm/InitializePasses.h"
|
|
|
|
#include "llvm/Support/CommandLine.h"
|
|
|
|
#include "llvm/Transforms/IPO.h"
|
2020-06-13 23:57:48 +02:00
|
|
|
#include "llvm/Transforms/IPO/Attributor.h"
|
2020-07-07 23:14:47 +02:00
|
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
2019-11-07 06:20:06 +01:00
|
|
|
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
|
2021-01-11 17:03:08 +01:00
|
|
|
#include "llvm/Transforms/Utils/CodeExtractor.h"
|
2019-11-07 06:20:06 +01:00
|
|
|
|
|
|
|
using namespace llvm;
|
|
|
|
using namespace omp;
|
|
|
|
|
|
|
|
#define DEBUG_TYPE "openmp-opt"
|
|
|
|
|
|
|
|
/// Command line switch to disable this pass entirely (off by default).
static cl::opt<bool> DisableOpenMPOptimizations(
    "openmp-opt-disable", cl::ZeroOrMore,
    cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
    cl::init(false));

/// Command line switch to enable the (off-by-default) parallel region merging
/// optimization, see mergeParallelRegions().
static cl::opt<bool> EnableParallelRegionMerging(
    "openmp-opt-enable-merging", cl::ZeroOrMore,
    cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
    cl::init(false));

/// Debugging aid: print the tracked internal control variable values.
static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                    cl::Hidden);

/// Debugging aid: print the GPU kernels (target region entry points) found.
static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
                                        cl::init(false), cl::Hidden);

/// Command line switch for the work-in-progress transformation that tries to
/// issue host-to-device transfers early, see hideMemTransfersLatency().
static cl::opt<bool> HideMemoryTransferLatency(
    "openmp-hide-memory-transfer-latency",
    cl::desc("[WIP] Tries to hide the latency of host to device memory"
             " transfers"),
    cl::Hidden, cl::init(false));
|
|
|
|
|
2019-11-07 06:20:06 +01:00
|
|
|
// Statistics collected by this pass (reported via llvm/ADT/Statistic.h, e.g.
// with `-stats`). They count the transformations performed as well as the
// runtime functions and kernels identified.
STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
          "Number of OpenMP runtime calls deduplicated");
STATISTIC(NumOpenMPParallelRegionsDeleted,
          "Number of OpenMP parallel regions deleted");
STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
          "Number of OpenMP runtime functions identified");
STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
          "Number of OpenMP runtime function uses identified");
STATISTIC(NumOpenMPTargetRegionKernels,
          "Number of OpenMP target region entry points (=kernels) identified");
STATISTIC(NumOpenMPTargetRegionKernelsSPMD,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "SPMD-mode instead of generic-mode");
STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode without a state machines");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines with fallback");
STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
          "Number of OpenMP target region entry points (=kernels) executed in "
          "generic-mode with customized state machines without fallback");
STATISTIC(
    NumOpenMPParallelRegionsReplacedInGPUStateMachine,
    "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
          "Number of OpenMP parallel regions merged");
STATISTIC(NumBytesMovedToSharedMemory,
          "Amount of memory pushed to shared memory");
|
2019-11-07 06:20:06 +01:00
|
|
|
|
2020-03-13 06:24:38 +01:00
|
|
|
// Prefix used in LLVM_DEBUG output of this pass; only needed (and only
// referenced) in builds with assertions/debug output enabled.
#if !defined(NDEBUG)
static constexpr auto TAG = "[" DEBUG_TYPE "]";
#endif
|
2019-11-07 06:20:06 +01:00
|
|
|
|
|
|
|
namespace {
|
2020-05-13 19:19:02 +02:00
|
|
|
|
2021-03-22 21:35:55 +01:00
|
|
|
/// Address space numbers used when moving globalized device memory.
/// NOTE(review): the numbering (and the gap at 2) presumably follows the
/// AMDGPU/NVPTX offload target address-space convention — confirm against the
/// target backends before extending.
enum class AddressSpace : unsigned {
  Generic = 0,
  Global = 1,
  Shared = 3,
  Constant = 4,
  Local = 5,
};
|
|
|
|
|
|
|
|
struct AAHeapToShared;
|
|
|
|
|
2020-07-11 01:06:46 +02:00
|
|
|
struct AAICVTracker;
|
|
|
|
|
2020-06-13 23:57:48 +02:00
|
|
|
/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
/// Attributor runs.
struct OMPInformationCache : public InformationCache {
  OMPInformationCache(Module &M, AnalysisGetter &AG,
                      BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
                      SmallPtrSetImpl<Kernel> &Kernels)
      : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
        Kernels(Kernels) {

    OMPBuilder.initialize();
    initializeRuntimeFunctions();
    initializeInternalControlVars();
  }

  /// Generic information that describes an internal control variable.
  struct InternalControlVarInfo {
    /// The kind, as described by InternalControlVar enum.
    InternalControlVar Kind;

    /// The name of the ICV.
    StringRef Name;

    /// Environment variable associated with this ICV.
    StringRef EnvVarName;

    /// Initial value kind.
    ICVInitValue InitKind;

    /// Initial value.
    ConstantInt *InitValue;

    /// Setter RTL function associated with this ICV.
    RuntimeFunction Setter;

    /// Getter RTL function associated with this ICV.
    RuntimeFunction Getter;

    /// RTL Function corresponding to the override clause of this ICV.
    RuntimeFunction Clause;
  };

  /// Generic information that describes a runtime function.
  struct RuntimeFunctionInfo {

    /// The kind, as described by the RuntimeFunction enum.
    RuntimeFunction Kind;

    /// The name of the function.
    StringRef Name;

    /// Flag to indicate a variadic function.
    bool IsVarArg;

    /// The return type of the function.
    Type *ReturnType;

    /// The argument types of the function.
    SmallVector<Type *, 8> ArgumentTypes;

    /// The declaration if available.
    Function *Declaration = nullptr;

    /// Uses of this runtime function per function containing the use.
    using UseVector = SmallVector<Use *, 16>;

    /// Clear UsesMap for runtime function.
    void clearUsesMap() { UsesMap.clear(); }

    /// Boolean conversion that is true if the runtime function was found.
    operator bool() const { return Declaration; }

    /// Return the vector of uses in function \p F, creating an empty one on
    /// first request.
    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
      if (!UV)
        UV = std::make_shared<UseVector>();
      return *UV;
    }

    /// Return the vector of uses in function \p F or `nullptr` if there are
    /// none.
    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();
      return nullptr;
    }

    /// Return how many functions contain uses of this runtime function.
    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    /// Return the number of arguments (or the minimal number for variadic
    /// functions).
    size_t getNumArgs() const { return ArgumentTypes.size(); }

    /// Run the callback \p CB on each use and forget the use if the result is
    /// true. The callback will be fed the function in which the use was
    /// encountered as second argument.
    void foreachUse(SmallVectorImpl<Function *> &SCC,
                    function_ref<bool(Use &, Function &)> CB) {
      for (Function *F : SCC)
        foreachUse(CB, F);
    }

    /// Run the callback \p CB on each use within the function \p F and forget
    /// the use if the result is true.
    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      // Collect indices of uses the callback asked us to forget; erasing while
      // iterating the use vector would invalidate the iteration.
      SmallVector<unsigned, 8> ToBeDeleted;

      unsigned Idx = 0;
      UseVector &UV = getOrCreateUseVector(F);

      for (Use *U : UV) {
        if (CB(*U, *F))
          ToBeDeleted.push_back(Idx);
        ++Idx;
      }

      // Remove the to-be-deleted indices in reverse order as prior
      // modifications will not modify the smaller indices. Swap-with-back is
      // fine as the use vector is unordered.
      while (!ToBeDeleted.empty()) {
        unsigned Idx = ToBeDeleted.pop_back_val();
        UV[Idx] = UV.back();
        UV.pop_back();
      }
    }

  private:
    /// Map from functions to all uses of this runtime function contained in
    /// them.
    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;

  public:
    /// Iterators for the uses of this runtime function.
    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
  };

  /// An OpenMP-IR-Builder instance
  OpenMPIRBuilder OMPBuilder;

  /// Map from runtime function kind to the runtime function description.
  EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
                  RuntimeFunction::OMPRTL___last>
      RFIs;

  /// Map from function declarations/definitions to their runtime enum type.
  DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;

  /// Map from ICV kind to the ICV description.
  EnumeratedArray<InternalControlVarInfo, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVs;

  /// Helper to initialize all internal control variable information for those
  /// defined in OMPKinds.def.
  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL)                                                 \
  {                                                                            \
    auto &ICV = ICVs[_Name];                                                   \
    ICV.Setter = RTL;                                                          \
  }
#define ICV_RT_GET(Name, RTL)                                                  \
  {                                                                            \
    auto &ICV = ICVs[Name];                                                    \
    ICV.Getter = RTL;                                                          \
  }
#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init)                           \
  {                                                                            \
    auto &ICV = ICVs[Enum];                                                    \
    ICV.Name = _Name;                                                          \
    ICV.Kind = Enum;                                                           \
    ICV.InitKind = Init;                                                       \
    ICV.EnvVarName = _EnvVarName;                                              \
    switch (ICV.InitKind) {                                                    \
    case ICV_IMPLEMENTATION_DEFINED:                                           \
      ICV.InitValue = nullptr;                                                 \
      break;                                                                   \
    case ICV_ZERO:                                                             \
      ICV.InitValue = ConstantInt::get(                                        \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0);                \
      break;                                                                   \
    case ICV_FALSE:                                                            \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext());    \
      break;                                                                   \
    case ICV_LAST:                                                             \
      break;                                                                   \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  /// Returns true if the function declaration \p F matches the runtime
  /// function types, that is, return type \p RTFRetType, and argument types
  /// \p RTFArgTypes.
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
                                  SmallVector<Type *, 8> &RTFArgTypes) {
    // TODO: We should output information to the user (under debug output
    // and via remarks).

    if (!F)
      return false;
    if (F->getReturnType() != RTFRetType)
      return false;
    if (F->arg_size() != RTFArgTypes.size())
      return false;

    auto RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)
        return false;

      ++RTFTyIt;
    }

    return true;
  }

  // Helper to collect all uses of the declaration in the UsesMap.
  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)
      return NumUses;
    OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);

    if (CollectStats) {
      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
    }

    // TODO: We directly convert uses into proper calls and unknown uses.
    for (Use &U : RFI.Declaration->uses()) {
      if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
        // Only record uses inside the module slice under consideration;
        // non-instruction users (e.g. constant expressions) are collected
        // under the nullptr key below.
        if (ModuleSlice.count(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
          ++NumUses;
        }
      } else {
        RFI.getOrCreateUseVector(nullptr).push_back(&U);
        ++NumUses;
      }
    }
    return NumUses;
  }

  // Helper function to recollect uses of a runtime function.
  void recollectUsesForFunction(RuntimeFunction RTF) {
    auto &RFI = RFIs[RTF];
    RFI.clearUsesMap();
    collectUses(RFI, /*CollectStats*/ false);
  }

  // Helper function to recollect uses of all runtime functions.
  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)
      recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
  }

  /// Helper to initialize all runtime function information for those defined
  /// in OpenMPKinds.def.
  void initializeRuntimeFunctions() {
    Module &M = *((*ModuleSlice.begin())->getParent());

    // Helper macros for handling __VA_ARGS__ in OMP_RTL
#define OMP_TYPE(VarName, ...)                                                 \
  Type *VarName = OMPBuilder.VarName;                                          \
  (void)VarName;

#define OMP_ARRAY_TYPE(VarName, ...)                                           \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty;                             \
  (void)VarName##Ty;                                                           \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy;                     \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...)                                        \
  FunctionType *VarName = OMPBuilder.VarName;                                  \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_STRUCT_TYPE(VarName, ...)                                          \
  StructType *VarName = OMPBuilder.VarName;                                    \
  (void)VarName;                                                               \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr;                         \
  (void)VarName##Ptr;

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...)                     \
  {                                                                            \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__});                           \
    Function *F = M.getFunction(_Name);                                        \
    RTLFunctions.insert(F);                                                    \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) {           \
      RuntimeFunctionIDMap[F] = _Enum;                                         \
      auto &RFI = RFIs[_Enum];                                                 \
      RFI.Kind = _Enum;                                                        \
      RFI.Name = _Name;                                                        \
      RFI.IsVarArg = _IsVarArg;                                                \
      RFI.ReturnType = OMPBuilder._ReturnType;                                 \
      RFI.ArgumentTypes = std::move(ArgsTypes);                                \
      RFI.Declaration = F;                                                     \
      unsigned NumUses = collectUses(RFI);                                     \
      (void)NumUses;                                                           \
      LLVM_DEBUG({                                                             \
        dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not")           \
               << " found\n";                                                  \
        if (RFI.Declaration)                                                   \
          dbgs() << TAG << "-> got " << NumUses << " uses in "                 \
                 << RFI.getNumFunctionsWithUses()                              \
                 << " different functions.\n";                                 \
      });                                                                      \
    }                                                                          \
  }
#include "llvm/Frontend/OpenMP/OMPKinds.def"

    // TODO: We should attach the attributes defined in OMPKinds.def.
  }

  /// Collection of known kernels (\see Kernel) in the module.
  SmallPtrSetImpl<Kernel> &Kernels;

  /// Collection of known OpenMP runtime functions.
  DenseSet<const Function *> RTLFunctions;
};
|
|
|
|
|
2021-05-20 07:37:29 +02:00
|
|
|
/// A BooleanState paired with an ordered set of pointers. Used by the kernel
/// info analysis below to track "which elements did we see" together with the
/// usual optimistic/pessimistic lattice state.
template <typename Ty, bool InsertInvalidates = true>
struct BooleanStateWithPtrSetVector : public BooleanState {

  bool contains(Ty *Elem) const { return Set.contains(Elem); }
  bool insert(Ty *Elem) {
    // If configured via the template parameter, any insertion pessimizes the
    // boolean part of the state; note this happens even if \p Elem was
    // already present.
    if (InsertInvalidates)
      BooleanState::indicatePessimisticFixpoint();
    return Set.insert(Elem);
  }

  Ty *operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithPtrSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;
  }
  bool operator!=(const BooleanStateWithPtrSetVector &RHS) const {
    return !(*this == RHS);
  }

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  /// "Clamp" this state with \p RHS.
  BooleanStateWithPtrSetVector &
  operator^=(const BooleanStateWithPtrSetVector &RHS) {
    BooleanState::operator^=(RHS);
    // Merging is a plain union of the element sets.
    Set.insert(RHS.Set.begin(), RHS.Set.end());
    return *this;
  }

private:
  /// A set to keep track of elements.
  SetVector<Ty *> Set;

public:
  /// Iterators over the tracked elements (insertion order).
  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }
  typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
  typename decltype(Set)::const_iterator end() const { return Set.end(); }
};
|
|
|
|
|
|
|
|
/// Attributor state describing what we know about an OpenMP (device) kernel:
/// reachable parallel regions, SPMD compatibility, the init/deinit calls, and
/// which kernel entries can reach the associated function.
struct KernelInfoState : AbstractState {
  /// Flag to track if we reached a fixpoint.
  bool IsAtFixpoint = false;

  /// The parallel regions (identified by the outlined parallel functions) that
  /// can be reached from the associated function.
  BooleanStateWithPtrSetVector<Function, /* InsertInvalidates */ false>
      ReachedKnownParallelRegions;

  /// State to track what parallel region we might reach.
  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  /// State to track if we are in SPMD-mode, assumed or know, and why we decided
  /// we cannot be.
  BooleanStateWithPtrSetVector<Instruction> SPMDCompatibilityTracker;

  /// The __kmpc_target_init call in this kernel, if any. If we find more than
  /// one we abort as the kernel is malformed.
  CallBase *KernelInitCB = nullptr;

  /// The __kmpc_target_deinit call in this kernel, if any. If we find more than
  /// one we abort as the kernel is malformed.
  CallBase *KernelDeinitCB = nullptr;

  /// Flag to indicate if the associated function is a kernel entry.
  bool IsKernelEntry = false;

  /// State to track what kernel entries can reach the associated function.
  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;

  /// Abstract State interface
  ///{

  KernelInfoState() {}
  KernelInfoState(bool BestState) {
    if (!BestState)
      indicatePessimisticFixpoint();
  }

  /// See AbstractState::isValidState(...)
  bool isValidState() const override { return true; }

  /// See AbstractState::isAtFixpoint(...)
  bool isAtFixpoint() const override { return IsAtFixpoint; }

  /// See AbstractState::indicatePessimisticFixpoint(...)
  ChangeStatus indicatePessimisticFixpoint() override {
    IsAtFixpoint = true;
    SPMDCompatibilityTracker.indicatePessimisticFixpoint();
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    return ChangeStatus::CHANGED;
  }

  /// See AbstractState::indicateOptimisticFixpoint(...)
  ChangeStatus indicateOptimisticFixpoint() override {
    IsAtFixpoint = true;
    return ChangeStatus::UNCHANGED;
  }

  /// Return the assumed state
  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

  // NOTE(review): equality deliberately(?) ignores KernelInitCB,
  // KernelDeinitCB, and IsKernelEntry — confirm this is intended, as states
  // differing only in those members compare equal.
  bool operator==(const KernelInfoState &RHS) const {
    if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
      return false;
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
      return false;
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
      return false;
    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
      return false;
    return true;
  }

  /// Return empty set as the best state of potential values.
  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();
  }

  /// Return full set as the worst state of potential values.
  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  /// "Clamp" this state with \p KIS.
  // NOTE(review): returns by value (copies the whole state) unlike the
  // reference-returning operator^= of BooleanStateWithPtrSetVector — consider
  // returning KernelInfoState& if no caller relies on the copy. Also note
  // ReachingKernelEntries is not merged here — confirm that is intentional.
  KernelInfoState operator^=(const KernelInfoState &KIS) {
    // Do not merge two different _init and _deinit call sites.
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
        indicatePessimisticFixpoint();
      KernelInitCB = KIS.KernelInitCB;
    }
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
        indicatePessimisticFixpoint();
      KernelDeinitCB = KIS.KernelDeinitCB;
    }
    SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    return *this;
  }

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
  }

  ///}
};
|
|
|
|
|
2020-08-31 22:29:22 +02:00
|
|
|
/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
  /// Physical array (in the IR).
  AllocaInst *Array = nullptr;
  /// Mapped values.
  SmallVector<Value *, 8> StoredValues;
  /// Last stores made in the offload array.
  SmallVector<StoreInst *, 8> LastAccesses;

  OffloadArray() = default;

  /// Initializes the OffloadArray with the values stored in \p Array before
  /// instruction \p Before is reached. Returns false if the initialization
  /// fails.
  /// This MUST be used immediately after the construction of the object.
  bool initialize(AllocaInst &Array, Instruction &Before) {
    if (!Array.getAllocatedType()->isArrayTy())
      return false;

    if (!getValues(Array, Before))
      return false;

    this->Array = &Array;
    return true;
  }

  // Argument positions in the __tgt_target_data_* runtime calls this class is
  // used with. NOTE(review): presumably match the signatures in OMPKinds.def —
  // confirm before reuse.
  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;

private:
  /// Traverses the BasicBlock where \p Array is, collecting the stores made to
  /// \p Array, leaving StoredValues with the values stored before the
  /// instruction \p Before is reached.
  bool getValues(AllocaInst &Array, Instruction &Before) {
    // Initialize container.
    const uint64_t NumValues = Array.getAllocatedType()->getArrayNumElements();
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

    // TODO: This assumes the instruction \p Before is in the same
    // BasicBlock as Array. Make it general, for any control flow graph.
    BasicBlock *BB = Array.getParent();
    if (BB != Before.getParent())
      return false;

    const DataLayout &DL = Array.getModule()->getDataLayout();
    const unsigned int PointerSize = DL.getPointerSize();

    for (Instruction &I : *BB) {
      if (&I == &Before)
        break;

      if (!isa<StoreInst>(&I))
        continue;

      auto *S = cast<StoreInst>(&I);
      int64_t Offset = -1;
      auto *Dst =
          GetPointerBaseWithConstantOffset(S->getPointerOperand(), Offset, DL);
      if (Dst == &Array) {
        // The index is derived by dividing the byte offset by the pointer
        // size, i.e. elements are assumed to be pointer-sized. Later stores
        // to the same slot overwrite earlier ones, keeping only the last.
        int64_t Idx = Offset / PointerSize;
        StoredValues[Idx] = getUnderlyingObject(S->getValueOperand());
        LastAccesses[Idx] = S;
      }
    }

    return isFilled();
  }

  /// Returns true if all values in StoredValues and
  /// LastAccesses are not nullptrs.
  bool isFilled() {
    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
        return false;
    }

    return true;
  }
};
|
|
|
|
|
2020-06-13 23:57:48 +02:00
|
|
|
struct OpenMPOpt {
|
|
|
|
|
|
|
|
using OptimizationRemarkGetter =
|
|
|
|
function_ref<OptimizationRemarkEmitter &(Function *)>;
|
|
|
|
|
|
|
|
OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
|
|
|
|
OptimizationRemarkGetter OREGetter,
|
2020-07-11 01:06:46 +02:00
|
|
|
OMPInformationCache &OMPInfoCache, Attributor &A)
|
2020-06-17 19:22:54 +02:00
|
|
|
: M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
|
2020-07-11 01:06:46 +02:00
|
|
|
OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
|
2020-06-13 23:57:48 +02:00
|
|
|
|
2020-09-24 18:49:58 +02:00
|
|
|
/// Check if any remarks are enabled for openmp-opt
|
|
|
|
bool remarksEnabled() {
|
|
|
|
auto &Ctx = M.getContext();
|
|
|
|
return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
|
|
|
|
}
|
|
|
|
|
2019-11-07 06:20:06 +01:00
|
|
|
/// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
///
/// \p IsModulePass selects between the module pipeline (attributor plus
/// globalization remarks) and the CGSCC pipeline (printing aids, region
/// deletion/merging, call deduplication, state machine rewriting).
/// \returns true if the IR was changed.
bool run(bool IsModulePass) {
  if (SCC.empty())
    return false;

  bool Changed = false;

  LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
                    << " functions in a slice with "
                    << OMPInfoCache.ModuleSlice.size() << " functions\n");

  if (IsModulePass) {
    Changed |= runAttributor(IsModulePass);

    // Recollect uses, in case Attributor deleted any.
    OMPInfoCache.recollectUses();

    // Globalization remarks are only worth computing when remarks are
    // actually requested.
    if (remarksEnabled())
      analysisGlobalization();
  } else {
    // Testing aids: dump ICV values and detected kernels when requested.
    if (PrintICVValues)
      printICVs();
    if (PrintOpenMPKernels)
      printKernels();

    Changed |= runAttributor(IsModulePass);

    // Recollect uses, in case Attributor deleted any.
    OMPInfoCache.recollectUses();

    Changed |= deleteParallelRegions();
    Changed |= rewriteDeviceCodeStateMachine();

    if (HideMemoryTransferLatency)
      Changed |= hideMemTransfersLatency();
    Changed |= deduplicateRuntimeCalls();
    if (EnableParallelRegionMerging) {
      // Merging emits new runtime calls (barriers, master constructs), so
      // rerun deduplication to clean up after a successful merge.
      if (mergeParallelRegions()) {
        deduplicateRuntimeCalls();
        Changed = true;
      }
    }
  }

  return Changed;
}
|
|
|
|
|
2020-07-07 02:19:12 +02:00
|
|
|
/// Print initial ICV values for testing.
|
|
|
|
/// FIXME: This should be done from the Attributor once it is added.
|
|
|
|
void printICVs() const {
|
2020-09-29 11:51:36 +02:00
|
|
|
InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
|
|
|
|
ICV_proc_bind};
|
2020-07-07 02:19:12 +02:00
|
|
|
|
|
|
|
for (Function *F : OMPInfoCache.ModuleSlice) {
|
|
|
|
for (auto ICV : ICVs) {
|
|
|
|
auto ICVInfo = OMPInfoCache.ICVs[ICV];
|
2021-05-19 18:19:50 +02:00
|
|
|
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
|
|
|
|
return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
|
|
|
|
<< " Value: "
|
|
|
|
<< (ICVInfo.InitValue
|
2021-06-11 14:19:00 +02:00
|
|
|
? toString(ICVInfo.InitValue->getValue(), 10, true)
|
2021-05-19 18:19:50 +02:00
|
|
|
: "IMPLEMENTATION_DEFINED");
|
2020-07-07 02:19:12 +02:00
|
|
|
};
|
|
|
|
|
2021-05-19 18:19:50 +02:00
|
|
|
emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPICVTracker", Remark);
|
2020-07-07 02:19:12 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Print OpenMP GPU kernels for testing.
|
|
|
|
void printKernels() const {
|
|
|
|
for (Function *F : SCC) {
|
|
|
|
if (!OMPInfoCache.Kernels.count(F))
|
|
|
|
continue;
|
|
|
|
|
2021-05-19 18:19:50 +02:00
|
|
|
auto Remark = [&](OptimizationRemarkAnalysis ORA) {
|
|
|
|
return ORA << "OpenMP GPU kernel "
|
|
|
|
<< ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
|
2020-07-07 02:19:12 +02:00
|
|
|
};
|
|
|
|
|
2021-05-19 18:19:50 +02:00
|
|
|
emitRemark<OptimizationRemarkAnalysis>(F, "OpenMPGPU", Remark);
|
2020-07-07 02:19:12 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-13 23:57:48 +02:00
|
|
|
/// Return the call if \p U is a callee use in a regular call. If \p RFI is
|
|
|
|
/// given it has to be the callee or a nullptr is returned.
|
|
|
|
static CallInst *getCallIfRegularCall(
|
|
|
|
Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
|
|
|
|
CallInst *CI = dyn_cast<CallInst>(U.getUser());
|
|
|
|
if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
|
|
|
|
(!RFI || CI->getCalledFunction() == RFI->Declaration))
|
|
|
|
return CI;
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Return the call if \p V is a regular call. If \p RFI is given it has to be
|
|
|
|
/// the callee or a nullptr is returned.
|
|
|
|
static CallInst *getCallIfRegularCall(
|
|
|
|
Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
|
|
|
|
CallInst *CI = dyn_cast<CallInst>(&V);
|
|
|
|
if (CI && !CI->hasOperandBundles() &&
|
|
|
|
(!RFI || CI->getCalledFunction() == RFI->Declaration))
|
|
|
|
return CI;
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
2019-11-07 06:20:06 +01:00
|
|
|
private:
|
2020-07-07 23:14:47 +02:00
|
|
|
/// Merge parallel regions when it is safe.
|
|
|
|
bool mergeParallelRegions() {
|
|
|
|
const unsigned CallbackCalleeOperand = 2;
|
|
|
|
const unsigned CallbackFirstArgOperand = 3;
|
|
|
|
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
|
|
|
|
|
|
|
|
// Check if there are any __kmpc_fork_call calls to merge.
|
|
|
|
OMPInformationCache::RuntimeFunctionInfo &RFI =
|
|
|
|
OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
|
|
|
|
|
|
|
|
if (!RFI.Declaration)
|
|
|
|
return false;
|
|
|
|
|
2021-01-11 17:03:08 +01:00
|
|
|
// Unmergable calls that prevent merging a parallel region.
|
|
|
|
OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
|
|
|
|
OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
|
|
|
|
OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
|
|
|
|
};
|
2020-07-07 23:14:47 +02:00
|
|
|
|
|
|
|
bool Changed = false;
|
|
|
|
LoopInfo *LI = nullptr;
|
|
|
|
DominatorTree *DT = nullptr;
|
|
|
|
|
|
|
|
SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;
|
|
|
|
|
|
|
|
BasicBlock *StartBB = nullptr, *EndBB = nullptr;
|
|
|
|
auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
|
|
|
|
BasicBlock &ContinuationIP) {
|
|
|
|
BasicBlock *CGStartBB = CodeGenIP.getBlock();
|
|
|
|
BasicBlock *CGEndBB =
|
|
|
|
SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
|
|
|
|
assert(StartBB != nullptr && "StartBB should not be null");
|
|
|
|
CGStartBB->getTerminator()->setSuccessor(0, StartBB);
|
|
|
|
assert(EndBB != nullptr && "EndBB should not be null");
|
|
|
|
EndBB->getTerminator()->setSuccessor(0, CGEndBB);
|
|
|
|
};
|
|
|
|
|
2020-11-26 18:32:30 +01:00
|
|
|
auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &,
|
|
|
|
Value &Inner, Value *&ReplacementValue) -> InsertPointTy {
|
|
|
|
ReplacementValue = &Inner;
|
2020-07-07 23:14:47 +02:00
|
|
|
return CodeGenIP;
|
|
|
|
};
|
|
|
|
|
|
|
|
auto FiniCB = [&](InsertPointTy CodeGenIP) {};
|
|
|
|
|
2021-01-11 17:03:08 +01:00
|
|
|
/// Create a sequential execution region within a merged parallel region,
|
|
|
|
/// encapsulated in a master construct with a barrier for synchronization.
|
|
|
|
auto CreateSequentialRegion = [&](Function *OuterFn,
|
|
|
|
BasicBlock *OuterPredBB,
|
|
|
|
Instruction *SeqStartI,
|
|
|
|
Instruction *SeqEndI) {
|
|
|
|
// Isolate the instructions of the sequential region to a separate
|
|
|
|
// block.
|
|
|
|
BasicBlock *ParentBB = SeqStartI->getParent();
|
|
|
|
BasicBlock *SeqEndBB =
|
|
|
|
SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
|
|
|
|
BasicBlock *SeqAfterBB =
|
|
|
|
SplitBlock(SeqEndBB, &*SeqEndBB->getFirstInsertionPt(), DT, LI);
|
|
|
|
BasicBlock *SeqStartBB =
|
|
|
|
SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
|
|
|
|
|
|
|
|
assert(ParentBB->getUniqueSuccessor() == SeqStartBB &&
|
|
|
|
"Expected a different CFG");
|
|
|
|
const DebugLoc DL = ParentBB->getTerminator()->getDebugLoc();
|
|
|
|
ParentBB->getTerminator()->eraseFromParent();
|
|
|
|
|
|
|
|
auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
|
|
|
|
BasicBlock &ContinuationIP) {
|
|
|
|
BasicBlock *CGStartBB = CodeGenIP.getBlock();
|
|
|
|
BasicBlock *CGEndBB =
|
|
|
|
SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
|
|
|
|
assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
|
|
|
|
CGStartBB->getTerminator()->setSuccessor(0, SeqStartBB);
|
|
|
|
assert(SeqEndBB != nullptr && "SeqEndBB should not be null");
|
|
|
|
SeqEndBB->getTerminator()->setSuccessor(0, CGEndBB);
|
|
|
|
};
|
|
|
|
auto FiniCB = [&](InsertPointTy CodeGenIP) {};
|
|
|
|
|
|
|
|
// Find outputs from the sequential region to outside users and
|
|
|
|
// broadcast their values to them.
|
|
|
|
for (Instruction &I : *SeqStartBB) {
|
|
|
|
SmallPtrSet<Instruction *, 4> OutsideUsers;
|
|
|
|
for (User *Usr : I.users()) {
|
|
|
|
Instruction &UsrI = *cast<Instruction>(Usr);
|
|
|
|
// Ignore outputs to LT intrinsics, code extraction for the merged
|
|
|
|
// parallel region will fix them.
|
|
|
|
if (UsrI.isLifetimeStartOrEnd())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (UsrI.getParent() != SeqStartBB)
|
|
|
|
OutsideUsers.insert(&UsrI);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (OutsideUsers.empty())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Emit an alloca in the outer region to store the broadcasted
|
|
|
|
// value.
|
|
|
|
const DataLayout &DL = M.getDataLayout();
|
|
|
|
AllocaInst *AllocaI = new AllocaInst(
|
|
|
|
I.getType(), DL.getAllocaAddrSpace(), nullptr,
|
|
|
|
I.getName() + ".seq.output.alloc", &OuterFn->front().front());
|
|
|
|
|
|
|
|
// Emit a store instruction in the sequential BB to update the
|
|
|
|
// value.
|
|
|
|
new StoreInst(&I, AllocaI, SeqStartBB->getTerminator());
|
|
|
|
|
|
|
|
// Emit a load instruction and replace the use of the output value
|
|
|
|
// with it.
|
|
|
|
for (Instruction *UsrI : OutsideUsers) {
|
2021-03-02 02:31:42 +01:00
|
|
|
LoadInst *LoadI = new LoadInst(
|
|
|
|
I.getType(), AllocaI, I.getName() + ".seq.output.load", UsrI);
|
2021-01-11 17:03:08 +01:00
|
|
|
UsrI->replaceUsesOfWith(&I, LoadI);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
OpenMPIRBuilder::LocationDescription Loc(
|
|
|
|
InsertPointTy(ParentBB, ParentBB->end()), DL);
|
|
|
|
InsertPointTy SeqAfterIP =
|
|
|
|
OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
|
|
|
|
|
|
|
|
OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
|
|
|
|
|
|
|
|
BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
|
|
|
|
<< "\n");
|
|
|
|
};
|
|
|
|
|
2020-07-07 23:14:47 +02:00
|
|
|
// Helper to merge the __kmpc_fork_call calls in MergableCIs. They are all
|
|
|
|
// contained in BB and only separated by instructions that can be
|
|
|
|
// redundantly executed in parallel. The block BB is split before the first
|
|
|
|
// call (in MergableCIs) and after the last so the entire region we merge
|
|
|
|
// into a single parallel region is contained in a single basic block
|
|
|
|
// without any other instructions. We use the OpenMPIRBuilder to outline
|
|
|
|
// that block and call the resulting function via __kmpc_fork_call.
|
|
|
|
auto Merge = [&](SmallVectorImpl<CallInst *> &MergableCIs, BasicBlock *BB) {
|
|
|
|
// TODO: Change the interface to allow single CIs expanded, e.g, to
|
|
|
|
// include an outer loop.
|
|
|
|
assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");
|
|
|
|
|
|
|
|
auto Remark = [&](OptimizationRemark OR) {
|
2021-07-13 16:01:21 +02:00
|
|
|
OR << "Parallel region merged with parallel region"
|
|
|
|
<< (MergableCIs.size() > 2 ? "s" : "") << " at ";
|
2021-01-18 19:16:36 +01:00
|
|
|
for (auto *CI : llvm::drop_begin(MergableCIs)) {
|
2020-07-07 23:14:47 +02:00
|
|
|
OR << ore::NV("OpenMPParallelMerge", CI->getDebugLoc());
|
|
|
|
if (CI != MergableCIs.back())
|
|
|
|
OR << ", ";
|
|
|
|
}
|
2021-07-13 16:01:21 +02:00
|
|
|
return OR << ".";
|
2020-07-07 23:14:47 +02:00
|
|
|
};
|
|
|
|
|
2021-07-13 21:31:44 +02:00
|
|
|
emitRemark<OptimizationRemark>(MergableCIs.front(), "OMP150", Remark);
|
2020-07-07 23:14:47 +02:00
|
|
|
|
|
|
|
Function *OriginalFn = BB->getParent();
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << "Merge " << MergableCIs.size()
|
|
|
|
<< " parallel regions in " << OriginalFn->getName()
|
|
|
|
<< "\n");
|
|
|
|
|
|
|
|
// Isolate the calls to merge in a separate block.
|
|
|
|
EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
|
|
|
|
BasicBlock *AfterBB =
|
|
|
|
SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
|
|
|
|
StartBB = SplitBlock(BB, MergableCIs.front(), DT, LI, nullptr,
|
|
|
|
"omp.par.merged");
|
|
|
|
|
|
|
|
assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
|
|
|
|
const DebugLoc DL = BB->getTerminator()->getDebugLoc();
|
|
|
|
BB->getTerminator()->eraseFromParent();
|
|
|
|
|
2021-01-11 17:03:08 +01:00
|
|
|
// Create sequential regions for sequential instructions that are
|
|
|
|
// in-between mergable parallel regions.
|
|
|
|
for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;
|
|
|
|
It != End; ++It) {
|
|
|
|
Instruction *ForkCI = *It;
|
|
|
|
Instruction *NextForkCI = *(It + 1);
|
|
|
|
|
|
|
|
// Continue if there are not in-between instructions.
|
|
|
|
if (ForkCI->getNextNode() == NextForkCI)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),
|
|
|
|
NextForkCI->getPrevNode());
|
|
|
|
}
|
|
|
|
|
2020-07-07 23:14:47 +02:00
|
|
|
OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
|
|
|
|
DL);
|
|
|
|
IRBuilder<>::InsertPoint AllocaIP(
|
|
|
|
&OriginalFn->getEntryBlock(),
|
|
|
|
OriginalFn->getEntryBlock().getFirstInsertionPt());
|
|
|
|
// Create the merged parallel region with default proc binding, to
|
|
|
|
// avoid overriding binding settings, and without explicit cancellation.
|
2020-11-10 01:47:41 +01:00
|
|
|
InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
|
2020-07-07 23:14:47 +02:00
|
|
|
Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
|
|
|
|
OMP_PROC_BIND_default, /* IsCancellable */ false);
|
|
|
|
BranchInst::Create(AfterBB, AfterIP.getBlock());
|
|
|
|
|
|
|
|
// Perform the actual outlining.
|
2021-03-04 00:15:32 +01:00
|
|
|
OMPInfoCache.OMPBuilder.finalize(OriginalFn,
|
|
|
|
/* AllowExtractorSinking */ true);
|
2020-07-07 23:14:47 +02:00
|
|
|
|
|
|
|
Function *OutlinedFn = MergableCIs.front()->getCaller();
|
|
|
|
|
|
|
|
// Replace the __kmpc_fork_call calls with direct calls to the outlined
|
|
|
|
// callbacks.
|
|
|
|
SmallVector<Value *, 8> Args;
|
|
|
|
for (auto *CI : MergableCIs) {
|
|
|
|
Value *Callee =
|
|
|
|
CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts();
|
|
|
|
FunctionType *FT =
|
|
|
|
cast<FunctionType>(Callee->getType()->getPointerElementType());
|
|
|
|
Args.clear();
|
|
|
|
Args.push_back(OutlinedFn->getArg(0));
|
|
|
|
Args.push_back(OutlinedFn->getArg(1));
|
|
|
|
for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
|
|
|
|
U < E; ++U)
|
|
|
|
Args.push_back(CI->getArgOperand(U));
|
|
|
|
|
|
|
|
CallInst *NewCI = CallInst::Create(FT, Callee, Args, "", CI);
|
|
|
|
if (CI->getDebugLoc())
|
|
|
|
NewCI->setDebugLoc(CI->getDebugLoc());
|
|
|
|
|
|
|
|
// Forward parameter attributes from the callback to the callee.
|
|
|
|
for (unsigned U = CallbackFirstArgOperand, E = CI->getNumArgOperands();
|
|
|
|
U < E; ++U)
|
|
|
|
for (const Attribute &A : CI->getAttributes().getParamAttributes(U))
|
|
|
|
NewCI->addParamAttr(
|
|
|
|
U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
|
|
|
|
|
|
|
|
// Emit an explicit barrier to replace the implicit fork-join barrier.
|
|
|
|
if (CI != MergableCIs.back()) {
|
|
|
|
// TODO: Remove barrier if the merged parallel region includes the
|
|
|
|
// 'nowait' clause.
|
2020-11-10 01:47:41 +01:00
|
|
|
OMPInfoCache.OMPBuilder.createBarrier(
|
2020-07-07 23:14:47 +02:00
|
|
|
InsertPointTy(NewCI->getParent(),
|
|
|
|
NewCI->getNextNode()->getIterator()),
|
|
|
|
OMPD_parallel);
|
|
|
|
}
|
|
|
|
|
|
|
|
CI->eraseFromParent();
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(OutlinedFn != OriginalFn && "Outlining failed");
|
[CGSCC][Coroutine][NewPM] Properly support function splitting/outlining
Previously when trying to support CoroSplit's function splitting, we
added in a hack that simply added the new function's node into the
original function's SCC (https://reviews.llvm.org/D87798). This is
incorrect since it might be in its own SCC.
Now, more similar to the previous design, we have callers explicitly
notify the LazyCallGraph that a function has been split out from another
one.
In order to properly support CoroSplit, there are two ways functions can
be split out.
One is the normal expected "outlining" of one function into a new one.
The new function may only contain references to other functions that the
original did. The original function must reference the new function. The
new function may reference the original function, which can result in
the new function being in the same SCC as the original function. The
weird case is when the original function indirectly references the new
function, but the new function directly calls the original function,
resulting in the new SCC being a parent of the original function's SCC.
This form of function splitting works with CoroSplit's Switch ABI.
The second way of splitting is more specific to CoroSplit. CoroSplit's
Retcon and Async ABIs split the original function into multiple
functions that all reference each other and are referenced by the
original function. In order to keep the LazyCallGraph in a valid state,
all new functions must be processed together, else some nodes won't be
populated. To keep things simple, this only supports the case where all
new edges are ref edges, and every new function references every other
new function. There can be a reference back from any new function to the
original function, putting all functions in the same RefSCC.
This also adds asserts that all nodes in a (Ref)SCC can reach all other
nodes to prevent future incorrect hacks.
The original hacks in https://reviews.llvm.org/D87798 are no longer
necessary since all new functions should have been registered before
calling updateCGAndAnalysisManagerForPass.
This fixes all coroutine tests when opt's -enable-new-pm is true by
default. This also fixes PR48190, which was likely due to the previous
hack breaking SCC invariants.
Reviewed By: rnk
Differential Revision: https://reviews.llvm.org/D93828
2020-12-26 19:25:34 +01:00
|
|
|
CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
|
2020-07-07 23:14:47 +02:00
|
|
|
CGUpdater.reanalyzeFunction(*OriginalFn);
|
|
|
|
|
|
|
|
NumOpenMPParallelRegionsMerged += MergableCIs.size();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Helper function that identifes sequences of
|
|
|
|
// __kmpc_fork_call uses in a basic block.
|
|
|
|
auto DetectPRsCB = [&](Use &U, Function &F) {
|
|
|
|
CallInst *CI = getCallIfRegularCall(U, &RFI);
|
|
|
|
BB2PRMap[CI->getParent()].insert(CI);
|
|
|
|
|
|
|
|
return false;
|
|
|
|
};
|
|
|
|
|
|
|
|
BB2PRMap.clear();
|
|
|
|
RFI.foreachUse(SCC, DetectPRsCB);
|
|
|
|
SmallVector<SmallVector<CallInst *, 4>, 4> MergableCIsVector;
|
|
|
|
// Find mergable parallel regions within a basic block that are
|
|
|
|
// safe to merge, that is any in-between instructions can safely
|
|
|
|
// execute in parallel after merging.
|
|
|
|
// TODO: support merging across basic-blocks.
|
|
|
|
for (auto &It : BB2PRMap) {
|
|
|
|
auto &CIs = It.getSecond();
|
|
|
|
if (CIs.size() < 2)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
BasicBlock *BB = It.getFirst();
|
|
|
|
SmallVector<CallInst *, 4> MergableCIs;
|
|
|
|
|
2021-01-11 17:03:08 +01:00
|
|
|
/// Returns true if the instruction is mergable, false otherwise.
|
|
|
|
/// A terminator instruction is unmergable by definition since merging
|
|
|
|
/// works within a BB. Instructions before the mergable region are
|
|
|
|
/// mergable if they are not calls to OpenMP runtime functions that may
|
|
|
|
/// set different execution parameters for subsequent parallel regions.
|
|
|
|
/// Instructions in-between parallel regions are mergable if they are not
|
|
|
|
/// calls to any non-intrinsic function since that may call a non-mergable
|
|
|
|
/// OpenMP runtime function.
|
|
|
|
auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
|
|
|
|
// We do not merge across BBs, hence return false (unmergable) if the
|
|
|
|
// instruction is a terminator.
|
|
|
|
if (I.isTerminator())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (!isa<CallInst>(&I))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
CallInst *CI = cast<CallInst>(&I);
|
|
|
|
if (IsBeforeMergableRegion) {
|
|
|
|
Function *CalledFunction = CI->getCalledFunction();
|
|
|
|
if (!CalledFunction)
|
|
|
|
return false;
|
|
|
|
// Return false (unmergable) if the call before the parallel
|
|
|
|
// region calls an explicit affinity (proc_bind) or number of
|
|
|
|
// threads (num_threads) compiler-generated function. Those settings
|
|
|
|
// may be incompatible with following parallel regions.
|
|
|
|
// TODO: ICV tracking to detect compatibility.
|
|
|
|
for (const auto &RFI : UnmergableCallsInfo) {
|
|
|
|
if (CalledFunction == RFI.Declaration)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Return false (unmergable) if there is a call instruction
|
|
|
|
// in-between parallel regions when it is not an intrinsic. It
|
|
|
|
// may call an unmergable OpenMP runtime function in its callpath.
|
|
|
|
// TODO: Keep track of possible OpenMP calls in the callpath.
|
|
|
|
if (!isa<IntrinsicInst>(CI))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
};
|
2020-07-07 23:14:47 +02:00
|
|
|
// Find maximal number of parallel region CIs that are safe to merge.
|
2021-01-11 17:03:08 +01:00
|
|
|
for (auto It = BB->begin(), End = BB->end(); It != End;) {
|
|
|
|
Instruction &I = *It;
|
|
|
|
++It;
|
|
|
|
|
2020-07-07 23:14:47 +02:00
|
|
|
if (CIs.count(&I)) {
|
|
|
|
MergableCIs.push_back(cast<CallInst>(&I));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2021-01-11 17:03:08 +01:00
|
|
|
// Continue expanding if the instruction is mergable.
|
|
|
|
if (IsMergable(I, MergableCIs.empty()))
|
2020-07-07 23:14:47 +02:00
|
|
|
continue;
|
|
|
|
|
2021-01-11 17:03:08 +01:00
|
|
|
// Forward the instruction iterator to skip the next parallel region
|
|
|
|
// since there is an unmergable instruction which can affect it.
|
|
|
|
for (; It != End; ++It) {
|
|
|
|
Instruction &SkipI = *It;
|
|
|
|
if (CIs.count(&SkipI)) {
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << "Skip parallel region " << SkipI
|
|
|
|
<< " due to " << I << "\n");
|
|
|
|
++It;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Store mergable regions found.
|
2020-07-07 23:14:47 +02:00
|
|
|
if (MergableCIs.size() > 1) {
|
|
|
|
MergableCIsVector.push_back(MergableCIs);
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << "Found " << MergableCIs.size()
|
|
|
|
<< " parallel regions in block " << BB->getName()
|
|
|
|
<< " of function " << BB->getParent()->getName()
|
|
|
|
<< "\n";);
|
|
|
|
}
|
|
|
|
|
|
|
|
MergableCIs.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!MergableCIsVector.empty()) {
|
|
|
|
Changed = true;
|
|
|
|
|
|
|
|
for (auto &MergableCIs : MergableCIsVector)
|
|
|
|
Merge(MergableCIs, BB);
|
2021-03-24 15:11:32 +01:00
|
|
|
MergableCIsVector.clear();
|
2020-07-07 23:14:47 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Changed) {
|
2021-01-11 17:03:08 +01:00
|
|
|
/// Re-collect use for fork calls, emitted barrier calls, and
|
|
|
|
/// any emitted master/end_master calls.
|
|
|
|
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
|
|
|
|
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
|
|
|
|
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
|
|
|
|
OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
|
2020-07-07 23:14:47 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2020-03-23 16:47:06 +01:00
|
|
|
/// Try to delete parallel regions if possible.
|
2020-02-09 01:42:24 +01:00
|
|
|
bool deleteParallelRegions() {
|
|
|
|
const unsigned CallbackCalleeOperand = 2;
|
|
|
|
|
2020-06-13 23:57:48 +02:00
|
|
|
OMPInformationCache::RuntimeFunctionInfo &RFI =
|
|
|
|
OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
|
|
|
|
|
2020-02-09 01:42:24 +01:00
|
|
|
if (!RFI.Declaration)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
bool Changed = false;
|
|
|
|
auto DeleteCallCB = [&](Use &U, Function &) {
|
|
|
|
CallInst *CI = getCallIfRegularCall(U);
|
|
|
|
if (!CI)
|
|
|
|
return false;
|
|
|
|
auto *Fn = dyn_cast<Function>(
|
|
|
|
CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
|
|
|
|
if (!Fn)
|
|
|
|
return false;
|
|
|
|
if (!Fn->onlyReadsMemory())
|
|
|
|
return false;
|
|
|
|
if (!Fn->hasFnAttribute(Attribute::WillReturn))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
|
|
|
|
<< CI->getCaller()->getName() << "\n");
|
2020-05-13 19:19:02 +02:00
|
|
|
|
|
|
|
auto Remark = [&](OptimizationRemark OR) {
|
2021-07-13 16:01:21 +02:00
|
|
|
return OR << "Removing parallel region with no side-effects.";
|
2020-05-13 19:19:02 +02:00
|
|
|
};
|
2021-07-13 21:31:44 +02:00
|
|
|
emitRemark<OptimizationRemark>(CI, "OMP160", Remark);
|
2020-05-13 19:19:02 +02:00
|
|
|
|
2020-02-09 01:42:24 +01:00
|
|
|
CGUpdater.removeCallSite(*CI);
|
|
|
|
CI->eraseFromParent();
|
|
|
|
Changed = true;
|
2020-06-12 17:11:34 +02:00
|
|
|
++NumOpenMPParallelRegionsDeleted;
|
2020-02-09 01:42:24 +01:00
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
2020-07-07 02:26:01 +02:00
|
|
|
RFI.foreachUse(SCC, DeleteCallCB);
|
2020-02-09 01:42:24 +01:00
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2020-07-07 02:29:23 +02:00
|
|
|
/// Try to eliminate runtime calls by reusing existing ones.
|
2019-11-07 06:20:06 +01:00
|
|
|
bool deduplicateRuntimeCalls() {
|
|
|
|
bool Changed = false;
|
|
|
|
|
2020-02-09 01:03:40 +01:00
|
|
|
RuntimeFunction DeduplicableRuntimeCallIDs[] = {
|
|
|
|
OMPRTL_omp_get_num_threads,
|
|
|
|
OMPRTL_omp_in_parallel,
|
|
|
|
OMPRTL_omp_get_cancellation,
|
|
|
|
OMPRTL_omp_get_thread_limit,
|
|
|
|
OMPRTL_omp_get_supported_active_levels,
|
|
|
|
OMPRTL_omp_get_level,
|
|
|
|
OMPRTL_omp_get_ancestor_thread_num,
|
|
|
|
OMPRTL_omp_get_team_size,
|
|
|
|
OMPRTL_omp_get_active_level,
|
|
|
|
OMPRTL_omp_in_final,
|
|
|
|
OMPRTL_omp_get_proc_bind,
|
|
|
|
OMPRTL_omp_get_num_places,
|
|
|
|
OMPRTL_omp_get_num_procs,
|
|
|
|
OMPRTL_omp_get_place_num,
|
|
|
|
OMPRTL_omp_get_partition_num_places,
|
|
|
|
OMPRTL_omp_get_partition_place_nums};
|
|
|
|
|
2020-05-25 22:34:08 +02:00
|
|
|
// Global-tid is handled separately.
|
2019-11-07 06:20:06 +01:00
|
|
|
SmallSetVector<Value *, 16> GTIdArgs;
|
|
|
|
collectGlobalThreadIdArguments(GTIdArgs);
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
|
|
|
|
<< " global thread ID arguments\n");
|
|
|
|
|
|
|
|
for (Function *F : SCC) {
|
2020-02-09 01:03:40 +01:00
|
|
|
for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
|
2020-08-27 14:34:54 +02:00
|
|
|
Changed |= deduplicateRuntimeCalls(
|
|
|
|
*F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
|
2020-02-09 01:03:40 +01:00
|
|
|
|
|
|
|
// __kmpc_global_thread_num is special as we can replace it with an
|
|
|
|
// argument in enough cases to make it worth trying.
|
2019-11-07 06:20:06 +01:00
|
|
|
Value *GTIdArg = nullptr;
|
|
|
|
for (Argument &Arg : F->args())
|
|
|
|
if (GTIdArgs.count(&Arg)) {
|
|
|
|
GTIdArg = &Arg;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
Changed |= deduplicateRuntimeCalls(
|
2020-06-13 23:57:48 +02:00
|
|
|
*F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
|
2019-11-07 06:20:06 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2020-08-18 03:18:21 +02:00
|
|
|
/// Tries to hide the latency of runtime calls that involve host to
|
|
|
|
/// device memory transfers by splitting them into their "issue" and "wait"
|
|
|
|
/// versions. The "issue" is moved upwards as much as possible. The "wait" is
|
|
|
|
/// moved downards as much as possible. The "issue" issues the memory transfer
|
|
|
|
/// asynchronously, returning a handle. The "wait" waits in the returned
|
|
|
|
/// handle for the memory transfer to finish.
|
|
|
|
bool hideMemTransfersLatency() {
|
|
|
|
auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
|
|
|
|
bool Changed = false;
|
|
|
|
auto SplitMemTransfers = [&](Use &U, Function &Decl) {
|
|
|
|
auto *RTCall = getCallIfRegularCall(U, &RFI);
|
|
|
|
if (!RTCall)
|
|
|
|
return false;
|
|
|
|
|
2020-08-31 22:29:22 +02:00
|
|
|
OffloadArray OffloadArrays[3];
|
|
|
|
if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
|
|
|
|
|
2020-08-19 18:03:23 +02:00
|
|
|
// TODO: Check if can be moved upwards.
|
|
|
|
bool WasSplit = false;
|
|
|
|
Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
|
|
|
|
if (WaitMovementPoint)
|
|
|
|
WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
|
|
|
|
|
2020-08-18 03:18:21 +02:00
|
|
|
Changed |= WasSplit;
|
|
|
|
return WasSplit;
|
|
|
|
};
|
|
|
|
RFI.foreachUse(SCC, SplitMemTransfers);
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2020-09-24 18:49:58 +02:00
|
|
|
void analysisGlobalization() {
|
2021-03-22 21:35:55 +01:00
|
|
|
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
|
2020-09-24 18:49:58 +02:00
|
|
|
|
2021-03-22 21:35:55 +01:00
|
|
|
auto CheckGlobalization = [&](Use &U, Function &Decl) {
|
|
|
|
if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
|
2021-06-22 20:57:52 +02:00
|
|
|
auto Remark = [&](OptimizationRemarkMissed ORM) {
|
|
|
|
return ORM
|
2021-03-22 21:35:55 +01:00
|
|
|
<< "Found thread data sharing on the GPU. "
|
|
|
|
<< "Expect degraded performance due to data globalization.";
|
|
|
|
};
|
2021-07-13 21:31:44 +02:00
|
|
|
emitRemark<OptimizationRemarkMissed>(CI, "OMP112", Remark);
|
2021-03-22 21:35:55 +01:00
|
|
|
}
|
2020-09-24 18:49:58 +02:00
|
|
|
|
2021-03-22 21:35:55 +01:00
|
|
|
return false;
|
|
|
|
};
|
|
|
|
|
|
|
|
RFI.foreachUse(SCC, CheckGlobalization);
|
2020-09-24 18:49:58 +02:00
|
|
|
}
|
|
|
|
|
2020-08-31 22:29:22 +02:00
|
|
|
/// Maps the values stored in the offload arrays passed as arguments to
/// \p RuntimeCall into the offload arrays in \p OAs.
/// \returns false if any of the three arrays cannot be traced back to an
/// analyzable alloca (or, for the sizes, a constant global).
bool getValuesInOffloadArrays(CallInst &RuntimeCall,
                              MutableArrayRef<OffloadArray> OAs) {
  assert(OAs.size() == 3 && "Need space for three offload arrays!");

  // A runtime call that involves memory offloading looks something like:
  // call void @__tgt_target_data_begin_mapper(arg0, arg1,
  //   i8** %offload_baseptrs, i8** %offload_ptrs, i64* %offload_sizes,
  // ...)
  // So, the idea is to access the allocas that allocate space for these
  // offload arrays, offload_baseptrs, offload_ptrs, offload_sizes.
  // Therefore:
  // i8** %offload_baseptrs.
  Value *BasePtrsArg =
      RuntimeCall.getArgOperand(OffloadArray::BasePtrsArgNum);
  // i8** %offload_ptrs.
  Value *PtrsArg = RuntimeCall.getArgOperand(OffloadArray::PtrsArgNum);
  // i8** %offload_sizes.
  Value *SizesArg = RuntimeCall.getArgOperand(OffloadArray::SizesArgNum);

  // Get values stored in **offload_baseptrs.
  auto *V = getUnderlyingObject(BasePtrsArg);
  if (!isa<AllocaInst>(V))
    return false;
  auto *BasePtrsArray = cast<AllocaInst>(V);
  if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))
    return false;

  // Get values stored in **offload_ptrs.
  V = getUnderlyingObject(PtrsArg);
  if (!isa<AllocaInst>(V))
    return false;
  auto *PtrsArray = cast<AllocaInst>(V);
  if (!OAs[1].initialize(*PtrsArray, RuntimeCall))
    return false;

  // Get values stored in **offload_sizes.
  V = getUnderlyingObject(SizesArg);
  // If it's a [constant] global array don't analyze it.
  if (isa<GlobalValue>(V))
    return isa<Constant>(V);
  if (!isa<AllocaInst>(V))
    return false;

  auto *SizesArray = cast<AllocaInst>(V);
  if (!OAs[2].initialize(*SizesArray, RuntimeCall))
    return false;

  return true;
}
|
|
|
|
|
|
|
|
/// Prints the values in the OffloadArrays \p OAs using LLVM_DEBUG.
|
|
|
|
/// For now this is a way to test that the function getValuesInOffloadArrays
|
|
|
|
/// is working properly.
|
|
|
|
/// TODO: Move this to a unittest when unittests are available for OpenMPOpt.
|
|
|
|
void dumpValuesInOffloadArrays(ArrayRef<OffloadArray> OAs) {
|
|
|
|
assert(OAs.size() == 3 && "There are three offload arrays to debug!");
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << " Successfully got offload values:\n");
|
|
|
|
std::string ValuesStr;
|
|
|
|
raw_string_ostream Printer(ValuesStr);
|
|
|
|
std::string Separator = " --- ";
|
|
|
|
|
|
|
|
for (auto *BP : OAs[0].StoredValues) {
|
|
|
|
BP->print(Printer);
|
|
|
|
Printer << Separator;
|
|
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << Printer.str() << "\n");
|
|
|
|
ValuesStr.clear();
|
|
|
|
|
|
|
|
for (auto *P : OAs[1].StoredValues) {
|
|
|
|
P->print(Printer);
|
|
|
|
Printer << Separator;
|
|
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "\t\toffload_ptrs: " << Printer.str() << "\n");
|
|
|
|
ValuesStr.clear();
|
|
|
|
|
|
|
|
for (auto *S : OAs[2].StoredValues) {
|
|
|
|
S->print(Printer);
|
|
|
|
Printer << Separator;
|
|
|
|
}
|
|
|
|
LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << Printer.str() << "\n");
|
|
|
|
}
|
|
|
|
|
2020-08-19 18:03:23 +02:00
|
|
|
/// Returns the instruction where the "wait" counterpart \p RuntimeCall can be
|
|
|
|
/// moved. Returns nullptr if the movement is not possible, or not worth it.
|
|
|
|
Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {
|
|
|
|
// FIXME: This traverses only the BasicBlock where RuntimeCall is.
|
|
|
|
// Make it traverse the CFG.
|
|
|
|
|
|
|
|
Instruction *CurrentI = &RuntimeCall;
|
|
|
|
bool IsWorthIt = false;
|
|
|
|
while ((CurrentI = CurrentI->getNextNode())) {
|
|
|
|
|
|
|
|
// TODO: Once we detect the regions to be offloaded we should use the
|
|
|
|
// alias analysis manager to check if CurrentI may modify one of
|
|
|
|
// the offloaded regions.
|
|
|
|
if (CurrentI->mayHaveSideEffects() || CurrentI->mayReadFromMemory()) {
|
|
|
|
if (IsWorthIt)
|
|
|
|
return CurrentI;
|
|
|
|
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: For now if we move it over anything without side effect
|
|
|
|
// is worth it.
|
|
|
|
IsWorthIt = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return end of BasicBlock.
|
|
|
|
return RuntimeCall.getParent()->getTerminator();
|
|
|
|
}
|
|
|
|
|
2020-08-18 03:18:21 +02:00
|
|
|
/// Splits \p RuntimeCall into its "issue" and "wait" counterparts. The issue
/// call replaces \p RuntimeCall in place; the wait call is created at
/// \p WaitMovementPoint and synchronizes on a stack-allocated async handle.
bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                             Instruction &WaitMovementPoint) {
  // Create stack allocated handle (__tgt_async_info) at the beginning of the
  // function. Used for storing information of the async transfer, allowing to
  // wait on it later.
  auto &IRBuilder = OMPInfoCache.OMPBuilder;
  auto *F = RuntimeCall.getCaller();
  Instruction *FirstInst = &(F->getEntryBlock().front());
  AllocaInst *Handle = new AllocaInst(
      IRBuilder.AsyncInfo, F->getAddressSpace(), "handle", FirstInst);

  // Add "issue" runtime call declaration:
  // declare %struct.tgt_async_info @__tgt_target_data_begin_issue(i64, i32,
  //   i8**, i8**, i64*, i64*)
  FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
      M, OMPRTL___tgt_target_data_begin_mapper_issue);

  // Change RuntimeCall call site for its asynchronous version.
  // The issue call takes all original arguments plus the handle.
  SmallVector<Value *, 16> Args;
  for (auto &Arg : RuntimeCall.args())
    Args.push_back(Arg.get());
  Args.push_back(Handle);

  // Insert the issue call right before the original call, then remove the
  // original (order matters: the new call is positioned relative to it).
  CallInst *IssueCallsite =
      CallInst::Create(IssueDecl, Args, /*NameStr=*/"", &RuntimeCall);
  RuntimeCall.eraseFromParent();

  // Add "wait" runtime call declaration:
  // declare void @__tgt_target_data_begin_wait(i64, %struct.__tgt_async_info)
  FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
      M, OMPRTL___tgt_target_data_begin_mapper_wait);

  // The wait takes the device id from the issue call and the handle.
  Value *WaitParams[2] = {
      IssueCallsite->getArgOperand(
          OffloadArray::DeviceIDArgNum), // device_id.
      Handle                             // handle to wait on.
  };
  CallInst::Create(WaitDecl, WaitParams, /*NameStr=*/"", &WaitMovementPoint);

  return true;
}
|
|
|
|
|
2020-04-21 01:25:24 +02:00
|
|
|
/// Combine \p CurrentIdent with \p NextIdent, preferring global idents when
/// \p GlobalOnly is set. \p SingleChoice is cleared once more than one
/// distinct candidate has been seen.
static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                  bool GlobalOnly, bool &SingleChoice) {
  // Identical idents trivially combine to themselves.
  if (CurrentIdent == NextIdent)
    return CurrentIdent;

  // TODO: Figure out how to actually combine multiple debug locations. For
  //       now we just keep an existing one if there is a single choice.
  if (GlobalOnly && !isa<GlobalValue>(NextIdent))
    return nullptr;

  // NextIdent is acceptable; it is the single choice only if no candidate
  // was recorded before.
  SingleChoice = !CurrentIdent;
  return NextIdent;
}
|
|
|
|
|
|
|
|
/// Return an `struct ident_t*` value that represents the ones used in the
|
|
|
|
/// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
|
|
|
|
/// return a local `struct ident_t*`. For now, if we cannot find a suitable
|
|
|
|
/// return value we create one from scratch. We also do not yet combine
|
|
|
|
/// information, e.g., the source locations, see combinedIdentStruct.
|
2020-06-13 23:57:48 +02:00
|
|
|
Value *
|
|
|
|
getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
|
|
|
|
Function &F, bool GlobalOnly) {
|
2020-04-21 01:25:24 +02:00
|
|
|
bool SingleChoice = true;
|
2020-02-20 21:17:43 +01:00
|
|
|
Value *Ident = nullptr;
|
|
|
|
auto CombineIdentStruct = [&](Use &U, Function &Caller) {
|
|
|
|
CallInst *CI = getCallIfRegularCall(U, &RFI);
|
|
|
|
if (!CI || &F != &Caller)
|
|
|
|
return false;
|
|
|
|
Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
|
2020-04-21 01:25:24 +02:00
|
|
|
/* GlobalOnly */ true, SingleChoice);
|
2020-02-20 21:17:43 +01:00
|
|
|
return false;
|
|
|
|
};
|
2020-07-07 02:26:01 +02:00
|
|
|
RFI.foreachUse(SCC, CombineIdentStruct);
|
2020-02-20 21:17:43 +01:00
|
|
|
|
2020-04-21 01:25:24 +02:00
|
|
|
if (!Ident || !SingleChoice) {
|
2020-02-20 21:17:43 +01:00
|
|
|
// The IRBuilder uses the insertion block to get to the module, this is
|
|
|
|
// unfortunate but we work around it for now.
|
2020-06-13 23:57:48 +02:00
|
|
|
if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
|
|
|
|
OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
|
2020-02-20 21:17:43 +01:00
|
|
|
&F.getEntryBlock(), F.getEntryBlock().begin()));
|
|
|
|
// Create a fallback location if non was found.
|
|
|
|
// TODO: Use the debug locations of the calls instead.
|
2020-06-13 23:57:48 +02:00
|
|
|
Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
|
|
|
|
Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
|
2020-02-20 21:17:43 +01:00
|
|
|
}
|
|
|
|
return Ident;
|
|
|
|
}
|
|
|
|
|
2020-07-07 02:29:23 +02:00
|
|
|
/// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
/// \p ReplVal if given. Returns true if any call was removed.
bool deduplicateRuntimeCalls(Function &F,
                             OMPInformationCache::RuntimeFunctionInfo &RFI,
                             Value *ReplVal = nullptr) {
  // Without at least two calls (or one call plus a replacement value) there
  // is nothing to deduplicate.
  auto *UV = RFI.getUseVector(F);
  if (!UV || UV->size() + (ReplVal != nullptr) < 2)
    return false;

  LLVM_DEBUG(
      dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
             << (ReplVal ? " with an existing value\n" : "\n") << "\n");

  // A caller-provided replacement must be an argument of \p F.
  assert((!ReplVal || (isa<Argument>(ReplVal) &&
                       cast<Argument>(ReplVal)->getParent() == &F)) &&
         "Unexpected replacement value!");

  // TODO: Use dominance to find a good position instead.
  // A call can be hoisted into the entry block only if all operands (except
  // a leading ident operand) are available there, i.e., not instructions.
  auto CanBeMoved = [this](CallBase &CB) {
    unsigned NumArgs = CB.getNumArgOperands();
    if (NumArgs == 0)
      return true;
    if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
      return false;
    for (unsigned u = 1; u < NumArgs; ++u)
      if (isa<Instruction>(CB.getArgOperand(u)))
        return false;
    return true;
  };

  // No replacement given: hoist the first movable call into the entry block
  // and reuse it as replacement for all the others.
  if (!ReplVal) {
    for (Use *U : *UV)
      if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
        if (!CanBeMoved(*CI))
          continue;

        CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
        ReplVal = CI;
        break;
      }
    if (!ReplVal)
      return false;
  }

  // If we use a call as a replacement value we need to make sure the ident is
  // valid at the new location. For now we just pick a global one, either
  // existing and used by one of the calls, or created from scratch.
  if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
    if (CI->getNumArgOperands() > 0 &&
        CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
      Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
                                                    /* GlobalOnly */ true);
      CI->setArgOperand(0, Ident);
    }
  }

  // Replace every remaining call with ReplVal: emit a remark, update the
  // call graph, rewrite uses, and erase the call.
  bool Changed = false;
  auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
    CallInst *CI = getCallIfRegularCall(U, &RFI);
    if (!CI || CI == ReplVal || &F != &Caller)
      return false;
    assert(CI->getCaller() == &F && "Unexpected call!");

    auto Remark = [&](OptimizationRemark OR) {
      return OR << "OpenMP runtime call "
                << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";
    };
    // Attach the remark to the call when it has a debug location, otherwise
    // fall back to the function.
    if (CI->getDebugLoc())
      emitRemark<OptimizationRemark>(CI, "OMP170", Remark);
    else
      emitRemark<OptimizationRemark>(&F, "OMP170", Remark);

    CGUpdater.removeCallSite(*CI);
    CI->replaceAllUsesWith(ReplVal);
    CI->eraseFromParent();
    ++NumOpenMPRuntimeCallsDeduplicated;
    Changed = true;
    return true;
  };
  RFI.foreachUse(SCC, ReplaceAndDeleteCB);

  return Changed;
}
|
|
|
|
|
|
|
|
/// Collect arguments that represent the global thread id in \p GTIdArgs.
void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
  // TODO: Below we basically perform a fixpoint iteration with a pessimistic
  //       initialization. We could define an AbstractAttribute instead and
  //       run the Attributor here once it can be run as an SCC pass.

  // Helper to check the argument \p ArgNo at all call sites of \p F for
  // a GTId. Returns true only if every call site passes either a known GTId
  // argument, the result of __kmpc_global_thread_num, or is \p RefCI itself.
  auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
    // Only local functions can be fully analyzed at all call sites.
    if (!F.hasLocalLinkage())
      return false;
    for (Use &U : F.uses()) {
      if (CallInst *CI = getCallIfRegularCall(U)) {
        Value *ArgOp = CI->getArgOperand(ArgNo);
        if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
            getCallIfRegularCall(
                *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
          continue;
      }
      // Any non-call use or unproven argument invalidates the candidate.
      return false;
    }
    return true;
  };

  // Helper to identify uses of a GTId as GTId arguments.
  auto AddUserArgs = [&](Value &GTId) {
    for (Use &U : GTId.uses())
      if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
        if (CI->isArgOperand(&U))
          if (Function *Callee = CI->getCalledFunction())
            if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
              GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
  };

  // The argument users of __kmpc_global_thread_num calls are GTIds.
  OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

  GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
    if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
      AddUserArgs(*CI);
    return false;
  });

  // Transitively search for more arguments by looking at the users of the
  // ones we know already. During the search the GTIdArgs vector is extended
  // so we cannot cache the size nor can we use a range based for.
  for (unsigned u = 0; u < GTIdArgs.size(); ++u)
    AddUserArgs(*GTIdArgs[u]);
}
|
|
|
|
|
2020-07-07 02:57:37 +02:00
|
|
|
/// Kernel (=GPU) optimizations and utility functions
|
|
|
|
///
|
|
|
|
///{{
|
|
|
|
|
|
|
|
/// Check if \p F is a kernel, hence entry point for target offloading.
/// Membership is determined by the Kernels set held in the
/// OMPInformationCache.
bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
|
|
|
|
|
|
|
|
/// Cache to remember the unique kernel for a function.
|
|
|
|
DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
|
|
|
|
|
|
|
|
/// Find the unique kernel that will execute \p F, if any.
|
|
|
|
Kernel getUniqueKernelFor(Function &F);
|
|
|
|
|
|
|
|
/// Find the unique kernel that will execute \p I, if any.
/// Convenience overload forwarding to the Function-based lookup using
/// \p I's enclosing function.
Kernel getUniqueKernelFor(Instruction &I) {
  return getUniqueKernelFor(*I.getFunction());
}
|
|
|
|
|
|
|
|
/// Rewrite the device (=GPU) code state machine create in non-SPMD mode in
|
|
|
|
/// the cases we can avoid taking the address of a function.
|
|
|
|
bool rewriteDeviceCodeStateMachine();
|
|
|
|
|
|
|
|
///
|
|
|
|
///}}
|
|
|
|
|
2020-05-13 19:19:02 +02:00
|
|
|
/// Emit a remark generically
|
|
|
|
///
|
|
|
|
/// This template function can be used to generically emit a remark. The
|
|
|
|
/// RemarkKind should be one of the following:
|
|
|
|
/// - OptimizationRemark to indicate a successful optimization attempt
|
|
|
|
/// - OptimizationRemarkMissed to report a failed optimization attempt
|
|
|
|
/// - OptimizationRemarkAnalysis to provide additional information about an
|
|
|
|
/// optimization attempt
|
|
|
|
///
|
|
|
|
/// The remark is built using a callback function provided by the caller that
|
|
|
|
/// takes a RemarkKind as input and returns a RemarkKind.
|
2021-05-19 18:19:50 +02:00
|
|
|
template <typename RemarkKind, typename RemarkCallBack>
|
|
|
|
void emitRemark(Instruction *I, StringRef RemarkName,
|
2020-07-07 02:19:12 +02:00
|
|
|
RemarkCallBack &&RemarkCB) const {
|
2021-05-19 18:19:50 +02:00
|
|
|
Function *F = I->getParent()->getParent();
|
2020-05-13 19:19:02 +02:00
|
|
|
auto &ORE = OREGetter(F);
|
|
|
|
|
2021-07-13 21:31:44 +02:00
|
|
|
if (RemarkName.startswith("OMP"))
|
|
|
|
ORE.emit([&]() {
|
|
|
|
return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
|
|
|
|
<< " [" << RemarkName << "]";
|
|
|
|
});
|
|
|
|
else
|
|
|
|
ORE.emit(
|
|
|
|
[&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });
|
2020-05-13 19:19:02 +02:00
|
|
|
}
|
|
|
|
|
2021-05-19 18:19:50 +02:00
|
|
|
/// Emit a remark on a function.
|
|
|
|
template <typename RemarkKind, typename RemarkCallBack>
|
|
|
|
void emitRemark(Function *F, StringRef RemarkName,
|
|
|
|
RemarkCallBack &&RemarkCB) const {
|
2020-06-19 16:51:35 +02:00
|
|
|
auto &ORE = OREGetter(F);
|
|
|
|
|
2021-07-13 21:31:44 +02:00
|
|
|
if (RemarkName.startswith("OMP"))
|
|
|
|
ORE.emit([&]() {
|
|
|
|
return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
|
|
|
|
<< " [" << RemarkName << "]";
|
|
|
|
});
|
|
|
|
else
|
|
|
|
ORE.emit(
|
|
|
|
[&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });
|
2020-06-19 16:51:35 +02:00
|
|
|
}
|
|
|
|
|
2020-07-07 02:29:23 +02:00
|
|
|
/// The underlying module.
|
2019-11-07 06:20:06 +01:00
|
|
|
Module &M;
|
|
|
|
|
|
|
|
/// The SCC we are operating on.
|
2020-04-21 00:51:38 +02:00
|
|
|
SmallVectorImpl<Function *> &SCC;
|
2019-11-07 06:20:06 +01:00
|
|
|
|
|
|
|
/// Callback to update the call graph, the first argument is a removed call,
|
|
|
|
/// the second an optional replacement call.
|
|
|
|
CallGraphUpdater &CGUpdater;
|
|
|
|
|
2020-05-13 19:19:02 +02:00
|
|
|
/// Callback to get an OptimizationRemarkEmitter from a Function *
|
|
|
|
OptimizationRemarkGetter OREGetter;
|
|
|
|
|
2020-06-13 23:57:48 +02:00
|
|
|
/// OpenMP-specific information cache. Also Used for Attributor runs.
|
|
|
|
OMPInformationCache &OMPInfoCache;
|
2020-07-11 01:06:46 +02:00
|
|
|
|
|
|
|
/// Attributor instance.
|
|
|
|
Attributor &A;
|
|
|
|
|
|
|
|
/// Helper function to run Attributor on SCC.
|
2021-05-20 07:37:29 +02:00
|
|
|
bool runAttributor(bool IsModulePass) {
|
2020-07-11 01:06:46 +02:00
|
|
|
if (SCC.empty())
|
|
|
|
return false;
|
|
|
|
|
2021-05-20 07:37:29 +02:00
|
|
|
registerAAs(IsModulePass);
|
2020-07-11 01:06:46 +02:00
|
|
|
|
|
|
|
ChangeStatus Changed = A.run();
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
|
|
|
|
<< " functions, result: " << Changed << ".\n");
|
|
|
|
|
|
|
|
return Changed == ChangeStatus::CHANGED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Populate the Attributor with abstract attribute opportunities in the
|
|
|
|
/// function.
|
2021-05-20 07:37:29 +02:00
|
|
|
void registerAAs(bool IsModulePass);
|
2020-07-11 01:06:46 +02:00
|
|
|
};
|
|
|
|
|
2020-07-07 02:57:37 +02:00
|
|
|
/// Find the unique kernel that will execute \p F, if any. Results are
/// cached in UniqueKernelMap; nullptr means "no unique kernel".
Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
  // Functions outside the module slice are not analyzed.
  if (!OMPInfoCache.ModuleSlice.count(&F))
    return nullptr;

  // Use a scope to keep the lifetime of the CachedKernel short.
  // (The recursive calls below may grow UniqueKernelMap and invalidate the
  // reference.)
  {
    Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
    if (CachedKernel)
      return *CachedKernel;

    // TODO: We should use an AA to create an (optimistic and callback
    //       call-aware) call graph. For now we stick to simple patterns that
    //       are less powerful, basically the worst fixpoint.
    if (isKernel(F)) {
      CachedKernel = Kernel(&F);
      return *CachedKernel;
    }

    CachedKernel = nullptr;
    // Externally visible functions may have unknown callers; give up.
    if (!F.hasLocalLinkage()) {

      // See https://openmp.llvm.org/remarks/OptimizationRemarks.html
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Potentially unknown OpenMP target region caller.";
      };
      emitRemark<OptimizationRemarkAnalysis>(&F, "OMP100", Remark);

      return nullptr;
    }
  }

  // Classify each use of \p F: only equality compares, direct calls, and
  // appearances in __kmpc_parallel_51 calls keep the kernel unique.
  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
    if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
      // Allow use in equality comparisons.
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);
      return nullptr;
    }
    if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
      // Allow direct calls.
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);

      OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
          OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
      // Allow the use in __kmpc_parallel_51 calls.
      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
        return getUniqueKernelFor(*CB);
      return nullptr;
    }
    // Disallow every other use.
    return nullptr;
  };

  // TODO: In the future we want to track more than just a unique kernel.
  SmallPtrSet<Kernel, 2> PotentialKernels;
  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));
  });

  // Exactly one potential kernel (which may still be nullptr for a
  // disallowed use) means the kernel is unique.
  Kernel K = nullptr;
  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  // Cache the result.
  UniqueKernelMap[&F] = K;

  return K;
}
|
|
|
|
|
|
|
|
// Rewrite the GPU state machine: for every parallel body function in the SCC
// that is only used in well-understood ways and called from a unique kernel,
// replace its address-taken state-machine uses with a private global ID so
// only direct calls to the function remain.
bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];

  bool Changed = false;
  // Without the __kmpc_parallel_51 runtime function there is nothing to do.
  if (!KernelParallelRFI)
    return Changed;

  for (Function *F : SCC) {

    // Check if the function is a use in a __kmpc_parallel_51 call at
    // all.
    bool UnknownUse = false;
    bool KernelParallelUse = false;
    unsigned NumDirectCalls = 0;

    SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
    OMPInformationCache::foreachUse(*F, [&](Use &U) {
      // Direct calls are fine and merely counted.
      if (auto *CB = dyn_cast<CallBase>(U.getUser()))
        if (CB->isCallee(&U)) {
          ++NumDirectCalls;
          return;
        }

      // Equality comparisons against the function pointer are part of the
      // state machine and will be rewritten.
      if (isa<ICmpInst>(U.getUser())) {
        ToBeReplacedStateMachineUses.push_back(&U);
        return;
      }

      // Find wrapper functions that represent parallel kernels.
      CallInst *CI =
          OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
      // Argument 6 of __kmpc_parallel_51 is the wrapper function operand.
      const unsigned int WrapperFunctionArgNo = 6;
      if (!KernelParallelUse && CI &&
          CI->getArgOperandNo(&U) == WrapperFunctionArgNo) {
        KernelParallelUse = true;
        ToBeReplacedStateMachineUses.push_back(&U);
        return;
      }
      UnknownUse = true;
    });

    // Do not emit a remark if we haven't seen a __kmpc_parallel_51
    // use.
    if (!KernelParallelUse)
      continue;

    // If this ever hits, we should investigate.
    // TODO: Checking the number of uses is not a necessary restriction and
    //       should be lifted.
    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() > 2) {
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is used in "
                   << (UnknownUse ? "unknown" : "unexpected")
                   << " ways. Will not attempt to rewrite the state machine.";
      };
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP101", Remark);
      continue;
    }

    // Even if we have __kmpc_parallel_51 calls, we (for now) give
    // up if the function is not called from a unique kernel.
    Kernel K = getUniqueKernelFor(*F);
    if (!K) {
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is not called from a unique kernel. "
                      "Will not attempt to rewrite the state machine.";
      };
      emitRemark<OptimizationRemarkAnalysis>(F, "OMP102", Remark);
      continue;
    }

    // We now know F is a parallel body function called only from the kernel K.
    // We also identified the state machine uses in which we replace the
    // function pointer by a new global symbol for identification purposes. This
    // ensures only direct calls to the function are left.

    Module &M = *F->getParent();
    Type *Int8Ty = Type::getInt8Ty(M.getContext());

    // A private constant i8 global serves as the unique identification token.
    auto *ID = new GlobalVariable(
        M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
        UndefValue::get(Int8Ty), F->getName() + ".ID");

    for (Use *U : ToBeReplacedStateMachineUses)
      U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;

    Changed = true;
  }

  return Changed;
}
|
|
|
|
|
2020-07-11 01:06:46 +02:00
|
|
|
/// Abstract Attribute for tracking ICV values.
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Give up immediately for functions we are not allowed to change.
  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();
  }

  /// Returns true if value is assumed to be tracked.
  bool isAssumedTracked() const { return getAssumed(); }

  /// Returns true if value is known to be tracked.
  // NOTE(review): this returns getAssumed(), same as isAssumedTracked() —
  // confirm whether getKnown() was intended here.
  bool isKnownTracked() const { return getAssumed(); }

  /// Create an abstract attribute view for the position \p IRP.
  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

  /// Return the value with which \p I can be replaced for specific \p ICV.
  /// The default implementation tracks nothing and returns None.
  virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                                const Instruction *I,
                                                Attributor &A) const {
    return None;
  }

  /// Return an assumed unique ICV value if a single candidate is found. If
  /// there cannot be one, return a nullptr. If it is not clear yet, return the
  /// Optional::NoneType.
  virtual Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const = 0;

  // Currently only nthreads is being tracked.
  // this array will only grow with time.
  InternalControlVar TrackableICVs[1] = {ICV_nthreads};

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAICVTracker"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is AAICVTracker
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
|
|
|
|
|
|
|
|
struct AAICVTrackerFunction : public AAICVTracker {
|
|
|
|
AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
|
|
|
|
: AAICVTracker(IRP, A) {}
|
|
|
|
|
|
|
|
// FIXME: come up with better string.
|
2020-08-30 11:27:48 +02:00
|
|
|
const std::string getAsStr() const override { return "ICVTrackerFunction"; }
|
2020-07-11 01:06:46 +02:00
|
|
|
|
|
|
|
// FIXME: come up with some stats.
|
|
|
|
void trackStatistics() const override {}
|
|
|
|
|
2020-08-30 11:27:48 +02:00
|
|
|
/// We don't manifest anything for this AA.
|
2020-07-11 01:06:46 +02:00
|
|
|
ChangeStatus manifest(Attributor &A) override {
|
2020-08-30 11:27:48 +02:00
|
|
|
return ChangeStatus::UNCHANGED;
|
2020-07-11 01:06:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Map of ICV to their values at specific program point.
|
2020-08-30 11:27:48 +02:00
|
|
|
EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
|
2020-07-11 01:06:46 +02:00
|
|
|
InternalControlVar::ICV___last>
|
2020-08-30 11:27:48 +02:00
|
|
|
ICVReplacementValuesMap;
|
2020-07-11 01:06:46 +02:00
|
|
|
|
|
|
|
ChangeStatus updateImpl(Attributor &A) override {
|
|
|
|
ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
|
|
|
|
|
|
|
|
Function *F = getAnchorScope();
|
|
|
|
|
|
|
|
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
|
|
|
|
|
|
|
|
for (InternalControlVar ICV : TrackableICVs) {
|
|
|
|
auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
|
|
|
|
|
2020-08-30 11:27:48 +02:00
|
|
|
auto &ValuesMap = ICVReplacementValuesMap[ICV];
|
2020-07-11 01:06:46 +02:00
|
|
|
auto TrackValues = [&](Use &U, Function &) {
|
|
|
|
CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
|
|
|
|
if (!CI)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// FIXME: handle setters with more that 1 arguments.
|
|
|
|
/// Track new value.
|
2020-08-30 11:27:48 +02:00
|
|
|
if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
|
2020-07-11 01:06:46 +02:00
|
|
|
HasChanged = ChangeStatus::CHANGED;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
};
|
|
|
|
|
2020-08-30 11:27:48 +02:00
|
|
|
auto CallCheck = [&](Instruction &I) {
|
|
|
|
Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
|
|
|
|
if (ReplVal.hasValue() &&
|
|
|
|
ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
|
|
|
|
HasChanged = ChangeStatus::CHANGED;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Track all changes of an ICV.
|
2020-07-11 01:06:46 +02:00
|
|
|
SetterRFI.foreachUse(TrackValues, F);
|
2020-08-30 11:27:48 +02:00
|
|
|
|
2021-07-10 02:09:40 +02:00
|
|
|
bool UsedAssumedInformation = false;
|
2020-08-30 11:27:48 +02:00
|
|
|
A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
|
2021-07-10 02:09:40 +02:00
|
|
|
UsedAssumedInformation,
|
2020-08-30 11:27:48 +02:00
|
|
|
/* CheckBBLivenessOnly */ true);
|
|
|
|
|
|
|
|
/// TODO: Figure out a way to avoid adding entry in
|
|
|
|
/// ICVReplacementValuesMap
|
|
|
|
Instruction *Entry = &F->getEntryBlock().front();
|
|
|
|
if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
|
|
|
|
ValuesMap.insert(std::make_pair(Entry, nullptr));
|
2020-07-11 01:06:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return HasChanged;
|
|
|
|
}
|
|
|
|
|
2020-08-30 11:27:48 +02:00
|
|
|
  /// Helper to check if \p I is a call and get the value for it if it is
  /// unique.
  ///
  /// Return semantics (Optional tri-state used throughout ICV tracking):
  ///   - None:    \p I does not change the ICV (unknown-but-unchanged).
  ///   - nullptr: \p I may change the ICV to an unknown value.
  ///   - V:       \p I sets the ICV to the known value V.
  Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
                                    InternalControlVar &ICV) const {

    const auto *CB = dyn_cast<CallBase>(I);
    // Non-calls, and calls the user asserted to be free of OpenMP (runtime)
    // routines, cannot modify the ICV.
    if (!CB || CB->hasFnAttr("no_openmp") ||
        CB->hasFnAttr("no_openmp_routines"))
      return None;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    // Indirect call, assume ICV changes.
    if (CalledFunction == nullptr)
      return nullptr;
    // The getter only reads the ICV; it never changes it.
    if (CalledFunction == GetterRFI.Declaration)
      return None;
    if (CalledFunction == SetterRFI.Declaration) {
      // A setter we already analyzed: report the tracked value; otherwise the
      // new value is unknown.
      if (ICVReplacementValuesMap[ICV].count(I))
        return ICVReplacementValuesMap[ICV].lookup(I);

      return nullptr;
    }

    // Since we don't know, assume it changes the ICV.
    if (CalledFunction->isDeclaration())
      return nullptr;

    // Defer to the ICV tracker of the callee (via the call site returned
    // position) for calls with a visible definition.
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::callsite_returned(*CB), DepClassTy::REQUIRED);

    if (ICVTrackingAA.isAssumedTracked())
      return ICVTrackingAA.getUniqueReplacementValue(ICV);

    // If we don't know, assume it changes.
    return nullptr;
  }
|
|
|
|
|
|
|
|
// We don't check unique value for a function, so return None.
|
|
|
|
Optional<Value *>
|
|
|
|
getUniqueReplacementValue(InternalControlVar ICV) const override {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
  /// Return the value with which \p I can be replaced for specific \p ICV.
  ///
  /// Performs a backward walk from \p I over all paths: within each block we
  /// scan instructions upwards; when a block is exhausted, its predecessors'
  /// terminators are queued. None = unknown, nullptr = not replaceable,
  /// V = unique known ICV value at \p I.
  Optional<Value *> getReplacementValue(InternalControlVar ICV,
                                        const Instruction *I,
                                        Attributor &A) const override {
    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    // Fast path: \p I itself is a tracked program point.
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    SmallVector<const Instruction *, 16> Worklist;
    SmallPtrSet<const Instruction *, 16> Visited;
    Worklist.push_back(I);

    Optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      const Instruction *CurrInst = Worklist.pop_back_val();
      if (!Visited.insert(CurrInst).second)
        continue;

      const BasicBlock *CurrBB = CurrInst->getParent();

      // Go up and look for all potential setters/calls that might change the
      // ICV.
      while ((CurrInst = CurrInst->getPrevNode())) {
        if (ValuesMap.count(CurrInst)) {
          Optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          // Unknown value, track new.
          if (!ReplVal.hasValue()) {
            ReplVal = NewReplVal;
            break;
          }

          // If we found a new value, we can't know the icv value anymore.
          if (NewReplVal.hasValue())
            if (ReplVal != NewReplVal)
              return nullptr;

          // Tracked point found; stop scanning this path.
          break;
        }

        Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
        // None means this instruction leaves the ICV untouched.
        if (!NewReplVal.hasValue())
          continue;

        // Unknown value, track new.
        if (!ReplVal.hasValue()) {
          ReplVal = NewReplVal;
          break;
        }

        // We found a new value; if it conflicts with the one already seen we
        // can't know the icv value anymore.
        if (ReplVal != NewReplVal)
          return nullptr;
      }

      // If we are in the same BB and we have a value, we are done.
      if (CurrBB == I->getParent() && ReplVal.hasValue())
        return ReplVal;

      // Go through all predecessors and add terminators for analysis.
      for (const BasicBlock *Pred : predecessors(CurrBB))
        if (const Instruction *Terminator = Pred->getTerminator())
          Worklist.push_back(Terminator);
    }

    return ReplVal;
  }
|
|
|
|
};
|
|
|
|
|
|
|
|
/// ICV tracker for the value an ICV has when a function returns: for each
/// trackable ICV, compute the unique value (if any) it holds at every return
/// instruction of the anchor function.
struct AAICVTrackerFunctionReturned : AAICVTracker {
  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerFunctionReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program point.
  // Here: the unique value at function return (None = unknown,
  // nullptr = no unique value).
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the value with which \p I can be replaced for specific \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    // Query the function-level tracker; it provides per-instruction values.
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> UniqueICVValue;

      // Accumulate the ICV value over all return instructions; bail (return
      // false) as soon as two returns disagree.
      auto CheckReturnInst = [&](Instruction &I) {
        Optional<Value *> NewReplVal =
            ICVTrackingAA.getReplacementValue(ICV, &I, A);

        // If we found a second ICV value there is no unique returned value.
        if (UniqueICVValue.hasValue() && UniqueICVValue != NewReplVal)
          return false;

        UniqueICVValue = NewReplVal;

        return true;
      };

      bool UsedAssumedInformation = false;
      // Visiting failed (conflicting values or not all returns visited):
      // record "no unique value".
      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
                                     UsedAssumedInformation,
                                     /* CheckBBLivenessOnly */ true))
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)
        continue;

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;
    }

    return Changed;
  }
};
|
|
|
|
|
|
|
|
/// ICV tracker for a call site of an ICV *getter*: if the ICV has a unique
/// known value at the call, the getter call can be replaced by that value and
/// deleted.
struct AAICVTrackerCallSite : AAICVTracker {
  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAnchorScope();
    if (!F || !A.isFunctionIPOAmendable(*F))
      indicatePessimisticFixpoint();

    // We only initialize this AA for getters, so we need to know which ICV it
    // gets.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    for (InternalControlVar ICV : TrackableICVs) {
      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;
        return;
      }
    }

    /// Unknown ICV.
    indicatePessimisticFixpoint();
  }

  ChangeStatus manifest(Attributor &A) override {
    // None (unknown) or nullptr (no unique value) means we cannot rewrite.
    if (!ReplVal.hasValue() || !ReplVal.getValue())
      return ChangeStatus::UNCHANGED;

    // Replace the getter call's uses with the known ICV value and delete it.
    A.changeValueAfterManifest(*getCtxI(), **ReplVal);
    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;
  }

  // FIXME: come up with better string.
  const std::string getAsStr() const override { return "ICVTrackerCallSite"; }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  // The ICV read by the getter this AA is attached to (set in initialize()).
  InternalControlVar AssociatedICV;
  // Unique ICV value at the call site, if known.
  Optional<Value *> ReplVal;

  ChangeStatus updateImpl(Attributor &A) override {
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    Optional<Value *> NewReplVal =
        ICVTrackingAA.getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;
  }

  // Return the value with which associated value can be replaced for specific
  // \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ReplVal;
  }
};
|
|
|
|
|
|
|
|
/// ICV tracker for the value returned by a call site: mirrors the callee's
/// AAICVTrackerFunctionReturned state at this particular call.
struct AAICVTrackerCallSiteReturned : AAICVTracker {
  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  // FIXME: come up with better string.
  const std::string getAsStr() const override {
    return "ICVTrackerCallSiteReturned";
  }

  // FIXME: come up with some stats.
  void trackStatistics() const override {}

  /// We don't manifest anything for this AA.
  ChangeStatus manifest(Attributor &A) override {
    return ChangeStatus::UNCHANGED;
  }

  // Map of ICV to their values at specific program point.
  // Here: the ICV value after this call returns.
  EnumeratedArray<Optional<Value *>, InternalControlVar,
                  InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  /// Return the value with which associated value can be replaced for specific
  /// \p ICV.
  Optional<Value *>
  getUniqueReplacementValue(InternalControlVar ICV) const override {
    return ICVReplacementValuesMap[ICV];
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Changed = ChangeStatus::UNCHANGED;
    // Query the callee's returned-value tracker.
    const auto &ICVTrackingAA = A.getAAFor<AAICVTracker>(
        *this, IRPosition::returned(*getAssociatedFunction()),
        DepClassTy::REQUIRED);

    // We don't have any information, so we assume it changes the ICV.
    if (!ICVTrackingAA.isAssumedTracked())
      return indicatePessimisticFixpoint();

    for (InternalControlVar ICV : TrackableICVs) {
      Optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      Optional<Value *> NewReplVal =
          ICVTrackingAA.getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)
        continue;

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;
    }
    return Changed;
  }
};
|
2021-04-28 22:22:53 +02:00
|
|
|
|
|
|
|
/// Determine, per basic block of the anchor function, whether it is executed
/// only by the initial (thread 0) GPU thread. Starts optimistically with all
/// blocks single-threaded and erodes the set in updateImpl().
struct AAExecutionDomainFunction : public AAExecutionDomain {
  AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
      : AAExecutionDomain(IRP, A) {}

  const std::string getAsStr() const override {
    return "[AAExecutionDomain] " + std::to_string(SingleThreadedBBs.size()) +
           "/" + std::to_string(NumBBs) + " BBs thread 0 only.";
  }

  /// See AbstractAttribute::trackStatistics().
  void trackStatistics() const override {}

  void initialize(Attributor &A) override {
    // Optimistic start: assume every block is executed by a single thread.
    Function *F = getAnchorScope();
    for (const auto &BB : *F)
      SingleThreadedBBs.insert(&BB);
    NumBBs = SingleThreadedBBs.size();
  }

  ChangeStatus manifest(Attributor &A) override {
    // Analysis-only AA: we just report the result in debug builds.
    LLVM_DEBUG({
      for (const BasicBlock *BB : SingleThreadedBBs)
        dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
               << BB->getName() << " is executed by a single thread.\n";
    });
    return ChangeStatus::UNCHANGED;
  }

  ChangeStatus updateImpl(Attributor &A) override;

  /// Check if an instruction is executed by a single thread.
  bool isExecutedByInitialThreadOnly(const Instruction &I) const override {
    return isExecutedByInitialThreadOnly(*I.getParent());
  }

  /// Check if a basic block is executed by a single thread; only meaningful
  /// while this AA is in a valid state.
  bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
    return isValidState() && SingleThreadedBBs.contains(&BB);
  }

  /// Set of basic blocks that are executed by a single thread.
  DenseSet<const BasicBlock *> SingleThreadedBBs;

  /// Total number of basic blocks in this function.
  long unsigned NumBBs;
};
|
|
|
|
|
|
|
|
/// Refine the set of single-threaded basic blocks:
///   1. The entry block is single-threaded only if *all* known call sites of
///      this function are themselves executed by the initial thread only.
///   2. Every other block is single-threaded if all predecessors are, or if
///      the incoming edge is guarded by a `__kmpc_target_init(...) == -1`
///      comparison (non-SPMD main-thread check).
/// Returns CHANGED iff the set shrank.
ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
  Function *F = getAnchorScope();
  ReversePostOrderTraversal<Function *> RPOT(F);
  auto NumSingleThreadedBBs = SingleThreadedBBs.size();

  bool AllCallSitesKnown;
  // A call site keeps the entry block single-threaded only if it is a direct
  // call from a position that is itself executed by the initial thread only.
  auto PredForCallSite = [&](AbstractCallSite ACS) {
    const auto &ExecutionDomainAA = A.getAAFor<AAExecutionDomain>(
        *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
        DepClassTy::REQUIRED);
    return ACS.isDirectCall() &&
           ExecutionDomainAA.isExecutedByInitialThreadOnly(
               *ACS.getInstruction());
  };

  if (!A.checkForAllCallSites(PredForCallSite, *this,
                              /* RequiresAllCallSites */ true,
                              AllCallSitesKnown))
    SingleThreadedBBs.erase(&F->getEntryBlock());

  auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
  auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];

  // Check if the edge into the successor block compares the __kmpc_target_init
  // result with -1. If we are in non-SPMD-mode that signals only the main
  // thread will execute the edge.
  auto IsInitialThreadOnly = [&](BranchInst *Edge, BasicBlock *SuccessorBB) {
    if (!Edge || !Edge->isConditional())
      return false;
    if (Edge->getSuccessor(0) != SuccessorBB)
      return false;

    auto *Cmp = dyn_cast<CmpInst>(Edge->getCondition());
    if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())
      return false;

    ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
    if (!C)
      return false;

    // Match: -1 == __kmpc_target_init (for non-SPMD kernels only!)
    if (C->isAllOnesValue()) {
      auto *CB = dyn_cast<CallBase>(Cmp->getOperand(0));
      if (!CB || CB->getCalledFunction() != RFI.Declaration)
        return false;
      const int InitIsSPMDArgNo = 1;
      auto *IsSPMDModeCI =
          dyn_cast<ConstantInt>(CB->getOperand(InitIsSPMDArgNo));
      return IsSPMDModeCI && IsSPMDModeCI->isZero();
    }

    return false;
  };

  // Merge all the predecessor states into the current basic block. A basic
  // block is executed by a single thread if all of its predecessors are.
  auto MergePredecessorStates = [&](BasicBlock *BB) {
    // Blocks without predecessors (e.g. the entry) keep their current state.
    if (pred_empty(BB))
      return SingleThreadedBBs.contains(BB);

    bool IsInitialThread = true;
    // Use the predecessors() range helper for consistency with the worklist
    // code above; guarded edges (main-thread check) do not constrain us.
    for (BasicBlock *PredBB : predecessors(BB)) {
      if (!IsInitialThreadOnly(dyn_cast<BranchInst>(PredBB->getTerminator()),
                               BB))
        IsInitialThread &= SingleThreadedBBs.contains(PredBB);
    }

    return IsInitialThread;
  };

  for (auto *BB : RPOT) {
    if (!MergePredecessorStates(BB))
      SingleThreadedBBs.erase(BB);
  }

  return (NumSingleThreadedBBs == SingleThreadedBBs.size())
             ? ChangeStatus::UNCHANGED
             : ChangeStatus::CHANGED;
}
|
|
|
|
|
2021-03-22 21:35:55 +01:00
|
|
|
/// Try to replace memory allocation calls called by a single thread with a
|
|
|
|
/// static buffer of shared memory.
|
|
|
|
struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
|
|
|
|
using Base = StateWrapper<BooleanState, AbstractAttribute>;
|
|
|
|
AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
|
|
|
|
|
|
|
|
/// Create an abstract attribute view for the position \p IRP.
|
|
|
|
static AAHeapToShared &createForPosition(const IRPosition &IRP,
|
|
|
|
Attributor &A);
|
|
|
|
|
|
|
|
/// See AbstractAttribute::getName().
|
|
|
|
const std::string getName() const override { return "AAHeapToShared"; }
|
|
|
|
|
|
|
|
/// See AbstractAttribute::getIdAddr().
|
|
|
|
const char *getIdAddr() const override { return &ID; }
|
|
|
|
|
|
|
|
/// This function should return true if the type of the \p AA is
|
|
|
|
/// AAHeapToShared.
|
|
|
|
static bool classof(const AbstractAttribute *AA) {
|
|
|
|
return (AA->getIdAddr() == &ID);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Unique ID (due to the unique address)
|
|
|
|
static const char ID;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct AAHeapToSharedFunction : public AAHeapToShared {
|
|
|
|
AAHeapToSharedFunction(const IRPosition &IRP, Attributor &A)
|
|
|
|
: AAHeapToShared(IRP, A) {}
|
|
|
|
|
|
|
|
const std::string getAsStr() const override {
|
|
|
|
return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
|
|
|
|
" malloc calls eligible.";
|
|
|
|
}
|
|
|
|
|
|
|
|
/// See AbstractAttribute::trackStatistics().
|
|
|
|
void trackStatistics() const override {}
|
|
|
|
|
|
|
|
void initialize(Attributor &A) override {
|
|
|
|
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
|
|
|
|
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
|
|
|
|
|
|
|
|
for (User *U : RFI.Declaration->users())
|
|
|
|
if (CallBase *CB = dyn_cast<CallBase>(U))
|
|
|
|
MallocCalls.insert(CB);
|
|
|
|
}
|
|
|
|
|
|
|
|
ChangeStatus manifest(Attributor &A) override {
|
|
|
|
if (MallocCalls.empty())
|
|
|
|
return ChangeStatus::UNCHANGED;
|
|
|
|
|
|
|
|
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
|
|
|
|
auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
|
|
|
|
|
|
|
|
Function *F = getAnchorScope();
|
|
|
|
auto *HS = A.lookupAAFor<AAHeapToStack>(IRPosition::function(*F), this,
|
|
|
|
DepClassTy::OPTIONAL);
|
|
|
|
|
|
|
|
ChangeStatus Changed = ChangeStatus::UNCHANGED;
|
|
|
|
for (CallBase *CB : MallocCalls) {
|
|
|
|
// Skip replacing this if HeapToStack has already claimed it.
|
[Attributor] Reorganize AAHeapToStack
In order to simplify future extensions, e.g., the merge of
AAHeapToShared in to AAHeapToStack, we reorganize AAHeapToStack and the
state we keep for each malloc-like call. The result is also less
confusing as we only track malloc-like calls, not all calls. Further, we
only perform the updates necessary for a malloc-like to argue it can go
to the stack, e.g., we won't check all uses if we moved on to the
"must-be-freed" argument.
This patch also uses Attributor helps to simplify the allocated size,
alignment, and the potentially freed objects.
Overall, this is mostly a reorganization and only the use of the
optimistic helpers should change (=improve) the capabilities a bit.
Differential Revision: https://reviews.llvm.org/D104993
2021-06-26 01:24:01 +02:00
|
|
|
if (HS && HS->isAssumedHeapToStack(*CB))
|
2021-03-22 21:35:55 +01:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// Find the unique free call to remove it.
|
|
|
|
SmallVector<CallBase *, 4> FreeCalls;
|
|
|
|
for (auto *U : CB->users()) {
|
|
|
|
CallBase *C = dyn_cast<CallBase>(U);
|
|
|
|
if (C && C->getCalledFunction() == FreeCall.Declaration)
|
|
|
|
FreeCalls.push_back(C);
|
|
|
|
}
|
|
|
|
if (FreeCalls.size() != 1)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
ConstantInt *AllocSize = dyn_cast<ConstantInt>(CB->getArgOperand(0));
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in "
|
|
|
|
<< CB->getCaller()->getName() << " with "
|
|
|
|
<< AllocSize->getZExtValue()
|
|
|
|
<< " bytes of shared memory\n");
|
|
|
|
|
|
|
|
// Create a new shared memory buffer of the same size as the allocation
|
|
|
|
// and replace all the uses of the original allocation with it.
|
|
|
|
Module *M = CB->getModule();
|
|
|
|
Type *Int8Ty = Type::getInt8Ty(M->getContext());
|
|
|
|
Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
|
|
|
|
auto *SharedMem = new GlobalVariable(
|
|
|
|
*M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage,
|
|
|
|
UndefValue::get(Int8ArrTy), CB->getName(), nullptr,
|
|
|
|
GlobalValue::NotThreadLocal,
|
|
|
|
static_cast<unsigned>(AddressSpace::Shared));
|
|
|
|
auto *NewBuffer =
|
|
|
|
ConstantExpr::getPointerCast(SharedMem, Int8Ty->getPointerTo());
|
|
|
|
|
2021-06-07 20:31:40 +02:00
|
|
|
auto Remark = [&](OptimizationRemark OR) {
|
|
|
|
return OR << "Replaced globalized variable with "
|
|
|
|
<< ore::NV("SharedMemory", AllocSize->getZExtValue())
|
|
|
|
<< ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ")
|
2021-07-13 16:01:21 +02:00
|
|
|
<< "of shared memory.";
|
2021-06-07 20:31:40 +02:00
|
|
|
};
|
2021-07-13 21:31:44 +02:00
|
|
|
A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
|
2021-06-07 20:31:40 +02:00
|
|
|
|
2021-03-22 21:35:55 +01:00
|
|
|
SharedMem->setAlignment(MaybeAlign(32));
|
|
|
|
|
|
|
|
A.changeValueAfterManifest(*CB, *NewBuffer);
|
|
|
|
A.deleteAfterManifest(*CB);
|
|
|
|
A.deleteAfterManifest(*FreeCalls.front());
|
|
|
|
|
|
|
|
NumBytesMovedToSharedMemory += AllocSize->getZExtValue();
|
|
|
|
Changed = ChangeStatus::CHANGED;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
|
|
|
ChangeStatus updateImpl(Attributor &A) override {
|
|
|
|
auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
|
|
|
|
auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
|
|
|
|
Function *F = getAnchorScope();
|
|
|
|
|
|
|
|
auto NumMallocCalls = MallocCalls.size();
|
|
|
|
|
|
|
|
// Only consider malloc calls executed by a single thread with a constant.
|
|
|
|
for (User *U : RFI.Declaration->users()) {
|
|
|
|
const auto &ED = A.getAAFor<AAExecutionDomain>(
|
|
|
|
*this, IRPosition::function(*F), DepClassTy::REQUIRED);
|
|
|
|
if (CallBase *CB = dyn_cast<CallBase>(U))
|
|
|
|
if (!dyn_cast<ConstantInt>(CB->getArgOperand(0)) ||
|
|
|
|
!ED.isExecutedByInitialThreadOnly(*CB))
|
|
|
|
MallocCalls.erase(CB);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (NumMallocCalls != MallocCalls.size())
|
|
|
|
return ChangeStatus::CHANGED;
|
|
|
|
|
|
|
|
return ChangeStatus::UNCHANGED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Collection of all malloc calls in a function.
|
|
|
|
SmallPtrSet<CallBase *, 4> MallocCalls;
|
|
|
|
};
|
|
|
|
|
2021-05-20 07:37:29 +02:00
|
|
|
/// Abstract attribute interface carrying a KernelInfoState: what we know
/// about a kernel (SPMD compatibility, reached parallel regions, ...).
struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
  using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
  AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Statistics are tracked as part of manifest for now.
  void trackStatistics() const override {}

  /// See AbstractAttribute::getAsStr()
  const std::string getAsStr() const override {
    if (!isValidState())
      return "<invalid>";
    return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
                                                            : "generic") +
           std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
                                                               : "") +
           std::string(" #PRs: ") +
           std::to_string(ReachedKnownParallelRegions.size()) +
           ", #Unknown PRs: " +
           std::to_string(ReachedUnknownParallelRegions.size());
  }

  /// Create an abstract attribute view for the position \p IRP.
  static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAKernelInfo"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is AAKernelInfo
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address).
  static const char ID;
};
|
|
|
|
|
|
|
|
/// The function kernel info abstract attribute, basically, what can we say
|
|
|
|
/// about a function with regards to the KernelInfoState.
|
|
|
|
struct AAKernelInfoFunction : AAKernelInfo {
|
|
|
|
AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
|
|
|
|
: AAKernelInfo(IRP, A) {}
|
|
|
|
|
|
|
|
  /// See AbstractAttribute::initialize(...).
  void initialize(Attributor &A) override {
    // This is a high-level transform that might change the constant arguments
    // of the init and deinit calls. We need to tell the Attributor about this
    // to avoid other parts using the current constant value for
    // simplification.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    // Only kernel entry functions carry the extra initialization below.
    Function *Fn = getAnchorScope();
    if (!OMPInfoCache.Kernels.count(Fn))
      return;

    // Add itself to the reaching kernel and set IsKernelEntry.
    ReachingKernelEntries.insert(Fn);
    IsKernelEntry = true;

    OMPInformationCache::RuntimeFunctionInfo &InitRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
    OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];

    // For kernels we perform more initialization work, first we find the init
    // and deinit calls. Each kernel must contain exactly one of each.
    auto StoreCallBase = [](Use &U,
                            OMPInformationCache::RuntimeFunctionInfo &RFI,
                            CallBase *&Storage) {
      CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
      assert(CB &&
             "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
      assert(!Storage &&
             "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
      Storage = CB;
      return false;
    };
    InitRFI.foreachUse(
        [&](Use &U, Function &) {
          StoreCallBase(U, InitRFI, KernelInitCB);
          return false;
        },
        Fn);
    DeinitRFI.foreachUse(
        [&](Use &U, Function &) {
          StoreCallBase(U, DeinitRFI, KernelDeinitCB);
          return false;
        },
        Fn);

    assert((KernelInitCB && KernelDeinitCB) &&
           "Kernel without __kmpc_target_init or __kmpc_target_deinit!");

    // For kernels we might need to initialize/finalize the IsSPMD state and
    // we need to register a simplification callback so that the Attributor
    // knows the constant arguments to __kmpc_target_init and
    // __kmpc_target_deinit might actually change.

    Attributor::SimplifictionCallbackTy StateMachineSimplifyCB =
        [&](const IRPosition &IRP, const AbstractAttribute *AA,
            bool &UsedAssumedInformation) -> Optional<Value *> {
      // IRP represents the "use generic state machine" argument of an
      // __kmpc_target_init call. We will answer this one with the internal
      // state. As long as we are not in an invalid state, we will create a
      // custom state machine so the value should be a `i1 false`. If we are
      // in an invalid state, we won't change the value that is in the IR.
      if (!isValidState())
        return nullptr;
      if (AA)
        A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
      // The answer is only "assumed" until this AA reaches a fixpoint.
      UsedAssumedInformation = !isAtFixpoint();
      auto *FalseVal =
          ConstantInt::getBool(IRP.getAnchorValue().getContext(), 0);
      return FalseVal;
    };

    Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB =
        [&](const IRPosition &IRP, const AbstractAttribute *AA,
            bool &UsedAssumedInformation) -> Optional<Value *> {
      // IRP represents the "SPMDCompatibilityTracker" argument of an
      // __kmpc_target_init or
      // __kmpc_target_deinit call. We will answer this one with the internal
      // state.
      if (!isValidState())
        return nullptr;
      if (!SPMDCompatibilityTracker.isAtFixpoint()) {
        if (AA)
          A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
        UsedAssumedInformation = true;
      } else {
        UsedAssumedInformation = false;
      }
      auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(),
                                       SPMDCompatibilityTracker.isAssumed());
      return Val;
    };

    // Argument positions of the runtime calls we want to simplify.
    constexpr const int InitIsSPMDArgNo = 1;
    constexpr const int DeinitIsSPMDArgNo = 1;
    constexpr const int InitUseStateMachineArgNo = 2;
    A.registerSimplificationCallback(
        IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo),
        StateMachineSimplifyCB);
    A.registerSimplificationCallback(
        IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo),
        IsSPMDModeSimplifyCB);
    A.registerSimplificationCallback(
        IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo),
        IsSPMDModeSimplifyCB);

    // Check if we know we are in SPMD-mode already.
    ConstantInt *IsSPMDArg =
        dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));
    if (IsSPMDArg && !IsSPMDArg->isZero())
      SPMDCompatibilityTracker.indicateOptimisticFixpoint();
  }
|
|
|
|
|
|
|
|
/// Modify the IR based on the KernelInfoState as the fixpoint iteration is
|
|
|
|
/// finished now.
|
|
|
|
ChangeStatus manifest(Attributor &A) override {
|
|
|
|
// If we are not looking at a kernel with __kmpc_target_init and
|
|
|
|
// __kmpc_target_deinit call we cannot actually manifest the information.
|
|
|
|
if (!KernelInitCB || !KernelDeinitCB)
|
|
|
|
return ChangeStatus::UNCHANGED;
|
|
|
|
|
2021-06-23 23:33:49 +02:00
|
|
|
// Known SPMD-mode kernels need no manifest changes.
|
|
|
|
if (SPMDCompatibilityTracker.isKnown())
|
|
|
|
return ChangeStatus::UNCHANGED;
|
|
|
|
|
|
|
|
// If we can we change the execution mode to SPMD-mode otherwise we build a
|
|
|
|
// custom state machine.
|
|
|
|
if (!changeToSPMDMode(A))
|
|
|
|
buildCustomStateMachine(A);
|
2021-05-20 07:37:29 +02:00
|
|
|
|
|
|
|
return ChangeStatus::CHANGED;
|
|
|
|
}
|
|
|
|
|
2021-06-23 23:33:49 +02:00
|
|
|
  /// Try to convert the associated kernel from generic-mode to SPMD-mode
  /// execution. Returns true on success. On failure, emits an analysis
  /// remark for every instruction that blocks the conversion.
  bool changeToSPMDMode(Attributor &A) {
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

    // If the tracker is not assumed compatible, we cannot convert. Explain
    // to the user which instructions are in the way, then bail out.
    if (!SPMDCompatibilityTracker.isAssumed()) {
      for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
        if (!NonCompatibleI)
          continue;

        // Skip diagnostics on calls to known OpenMP runtime functions for now.
        if (auto *CB = dyn_cast<CallBase>(NonCompatibleI))
          if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
            continue;

        // Remark (OMP121): the instruction has side effects we cannot prove
        // SPMD-compatible; for calls, suggest the override assumption.
        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          ORA << "Value has potential side effects preventing SPMD-mode "
                 "execution";
          if (isa<CallBase>(NonCompatibleI)) {
            ORA << ". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
                   "the called function to override";
          }
          return ORA << ".";
        };
        A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI, "OMP121",
                                                 Remark);

        LLVM_DEBUG(dbgs() << TAG << "SPMD-incompatible side-effect: "
                          << *NonCompatibleI << "\n");
      }

      return false;
    }

    // Adjust the global exec mode flag that tells the runtime what mode this
    // kernel is executed in. The flag is a global named "<kernel>_exec_mode";
    // its initializer is asserted to be 1 (generic) before we flip it to 0
    // (SPMD).
    Function *Kernel = getAnchorScope();
    GlobalVariable *ExecMode = Kernel->getParent()->getGlobalVariable(
        (Kernel->getName() + "_exec_mode").str());
    assert(ExecMode && "Kernel without exec mode?");
    assert(ExecMode->getInitializer() &&
           ExecMode->getInitializer()->isOneValue() &&
           "Initially non-SPMD kernel has SPMD exec mode!");
    ExecMode->setInitializer(
        ConstantInt::get(ExecMode->getInitializer()->getType(), 0));

    // Next rewrite the init and deinit calls to indicate we use SPMD-mode now.
    // Argument positions of the __kmpc_target_init/__kmpc_target_deinit calls.
    const int InitIsSPMDArgNo = 1;
    const int DeinitIsSPMDArgNo = 1;
    const int InitUseStateMachineArgNo = 2;

    // IsSPMD becomes true; UseStateMachine becomes false (SPMD-mode kernels
    // do not need one).
    auto &Ctx = getAnchorValue().getContext();
    A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo),
                             *ConstantInt::getBool(Ctx, 1));
    A.changeUseAfterManifest(
        KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo),
        *ConstantInt::getBool(Ctx, 0));
    A.changeUseAfterManifest(
        KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo),
        *ConstantInt::getBool(Ctx, 1));
    ++NumOpenMPTargetRegionKernelsSPMD;

    // Remark (OMP120): the conversion succeeded.
    auto Remark = [&](OptimizationRemark OR) {
      return OR << "Transformed generic-mode kernel to SPMD-mode.";
    };
    A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP120", Remark);
    return true;
  };
|
|
|
|
|
2021-05-20 07:37:29 +02:00
|
|
|
  /// Replace the generic runtime-provided worker state machine of a
  /// generic-mode kernel with a custom one that is specialized for the
  /// parallel regions known to be reachable from the kernel. Returns CHANGED
  /// if any IR was emitted.
  ChangeStatus buildCustomStateMachine(Attributor &A) {
    assert(ReachedKnownParallelRegions.isValidState() &&
           "Custom state machine with invalid parallel region states?");

    // Argument positions of the __kmpc_target_init call.
    const int InitIsSPMDArgNo = 1;
    const int InitUseStateMachineArgNo = 2;

    // Check if the current configuration is non-SPMD and generic state machine.
    // If we already have SPMD mode or a custom state machine we do not need to
    // go any further. If it is anything but a constant something is weird and
    // we give up.
    ConstantInt *UseStateMachine = dyn_cast<ConstantInt>(
        KernelInitCB->getArgOperand(InitUseStateMachineArgNo));
    ConstantInt *IsSPMD =
        dyn_cast<ConstantInt>(KernelInitCB->getArgOperand(InitIsSPMDArgNo));

    // If we are stuck with generic mode, try to create a custom device (=GPU)
    // state machine which is specialized for the parallel regions that are
    // reachable by the kernel.
    if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD ||
        !IsSPMD->isZero())
      return ChangeStatus::UNCHANGED;

    // If not SPMD mode, indicate we use a custom state machine now.
    auto &Ctx = getAnchorValue().getContext();
    auto *FalseVal = ConstantInt::getBool(Ctx, 0);
    A.changeUseAfterManifest(
        KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *FalseVal);

    // If we don't actually need a state machine we are done here. This can
    // happen if there simply are no parallel regions. In the resulting kernel
    // all worker threads will simply exit right away, leaving the main thread
    // to do the work alone.
    if (ReachedKnownParallelRegions.empty() &&
        ReachedUnknownParallelRegions.empty()) {
      ++NumOpenMPTargetRegionKernelsWithoutStateMachine;

      // Remark (OMP130): the generic state machine was removed entirely.
      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Removing unused state machine from generic-mode kernel.";
      };
      A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP130", Remark);

      return ChangeStatus::CHANGED;
    }

    // Keep track in the statistics of our new shiny custom state machine.
    if (ReachedUnknownParallelRegions.empty()) {
      ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;

      // Remark (OMP131): all reachable parallel regions are known; no
      // indirect-call fallback is needed.
      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Rewriting generic-mode kernel with a customized state "
                     "machine.";
      };
      A.emitRemark<OptimizationRemark>(KernelInitCB, "OMP131", Remark);
    } else {
      ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;

      // Remark (OMP132): an indirect-call fallback remains in the state
      // machine because some parallel regions are unknown.
      auto Remark = [&](OptimizationRemarkAnalysis OR) {
        return OR << "Generic-mode kernel is executed with a customized state "
                     "machine that requires a fallback.";
      };
      A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB, "OMP132", Remark);

      // Tell the user why we ended up with a fallback.
      for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
        if (!UnknownParallelRegionCB)
          continue;
        // Remark (OMP133): point at each call that may hide a parallel region.
        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          return ORA << "Call may contain unknown parallel regions. Use "
                     << "`__attribute__((assume(\"omp_no_parallelism\")))` to "
                        "override.";
        };
        A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
                                                 "OMP133", Remark);
      }
    }

    // Create all the blocks:
    //
    //                       InitCB = __kmpc_target_init(...)
    //                       bool IsWorker = InitCB >= 0;
    //                       if (IsWorker) {
    // SMBeginBB:               __kmpc_barrier_simple_spmd(...);
    //                         void *WorkFn;
    //                         bool Active = __kmpc_kernel_parallel(&WorkFn);
    //                         if (!WorkFn) return;
    // SMIsActiveCheckBB:       if (Active) {
    // SMIfCascadeCurrentBB:      if      (WorkFn == <ParFn0>)
    //                              ParFn0(...);
    // SMIfCascadeCurrentBB:      else if (WorkFn == <ParFn1>)
    //                              ParFn1(...);
    //                            ...
    // SMIfCascadeCurrentBB:      else
    //                              ((WorkFnTy*)WorkFn)(...);
    // SMEndParallelBB:           __kmpc_kernel_end_parallel(...);
    //                          }
    // SMDoneBB:                __kmpc_barrier_simple_spmd(...);
    //                          goto SMBeginBB;
    //                       }
    // UserCodeEntryBB:      // user code
    //                       __kmpc_target_deinit(...)
    //
    Function *Kernel = getAssociatedFunction();
    assert(Kernel && "Expected an associated function!");

    // Split the entry so the user code is reached through the IsWorker check
    // and create the skeleton blocks in front of it.
    BasicBlock *InitBB = KernelInitCB->getParent();
    BasicBlock *UserCodeEntryBB = InitBB->splitBasicBlock(
        KernelInitCB->getNextNode(), "thread.user_code.check");
    BasicBlock *StateMachineBeginBB = BasicBlock::Create(
        Ctx, "worker_state_machine.begin", Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineFinishedBB = BasicBlock::Create(
        Ctx, "worker_state_machine.finished", Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineIsActiveCheckBB = BasicBlock::Create(
        Ctx, "worker_state_machine.is_active.check", Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineIfCascadeCurrentBB =
        BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
                           Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineEndParallelBB =
        BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.end",
                           Kernel, UserCodeEntryBB);
    BasicBlock *StateMachineDoneBarrierBB = BasicBlock::Create(
        Ctx, "worker_state_machine.done.barrier", Kernel, UserCodeEntryBB);

    // Reuse the init call's debug location for everything we emit.
    const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
    ReturnInst::Create(Ctx, StateMachineFinishedBB)->setDebugLoc(DLoc);

    // __kmpc_target_init returns -1 for the main thread; everything else is a
    // worker and enters the state machine.
    InitBB->getTerminator()->eraseFromParent();
    Instruction *IsWorker =
        ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_NE, KernelInitCB,
                         ConstantInt::get(KernelInitCB->getType(), -1),
                         "thread.is_worker", InitBB);
    IsWorker->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineBeginBB, UserCodeEntryBB, IsWorker, InitBB);

    // Create local storage for the work function pointer.
    Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
    AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "worker.work_fn.addr",
                                          &Kernel->getEntryBlock().front());
    WorkFnAI->setDebugLoc(DLoc);

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    OMPInfoCache.OMPBuilder.updateToLocation(
        OpenMPIRBuilder::LocationDescription(
            IRBuilder<>::InsertPoint(StateMachineBeginBB,
                                     StateMachineBeginBB->end()),
            DLoc));

    // The ident is the first __kmpc_target_init argument; the init call's
    // result doubles as the global thread id.
    Value *Ident = KernelInitCB->getArgOperand(0);
    Value *GTid = KernelInitCB;

    // Workers wait at a barrier until parallel work is published.
    Module &M = *Kernel->getParent();
    FunctionCallee BarrierFn =
        OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
            M, OMPRTL___kmpc_barrier_simple_spmd);
    CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineBeginBB)
        ->setDebugLoc(DLoc);

    // Ask the runtime for the work function and whether this worker is part
    // of the active team.
    FunctionCallee KernelParallelFn =
        OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
            M, OMPRTL___kmpc_kernel_parallel);
    Instruction *IsActiveWorker = CallInst::Create(
        KernelParallelFn, {WorkFnAI}, "worker.is_active", StateMachineBeginBB);
    IsActiveWorker->setDebugLoc(DLoc);
    Instruction *WorkFn = new LoadInst(VoidPtrTy, WorkFnAI, "worker.work_fn",
                                       StateMachineBeginBB);
    WorkFn->setDebugLoc(DLoc);

    // Known parallel region wrapper functions take (i16, i32) and return void.
    FunctionType *ParallelRegionFnTy = FunctionType::get(
        Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
        false);
    Value *WorkFnCast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
        WorkFn, ParallelRegionFnTy->getPointerTo(), "worker.work_fn.addr_cast",
        StateMachineBeginBB);

    // A null work function means the kernel is done and workers can leave.
    Instruction *IsDone =
        ICmpInst::Create(ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFn,
                         Constant::getNullValue(VoidPtrTy), "worker.is_done",
                         StateMachineBeginBB);
    IsDone->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
                       IsDone, StateMachineBeginBB)
        ->setDebugLoc(DLoc);

    // Inactive workers skip straight to the done barrier.
    BranchInst::Create(StateMachineIfCascadeCurrentBB,
                       StateMachineDoneBarrierBB, IsActiveWorker,
                       StateMachineIsActiveCheckBB)
        ->setDebugLoc(DLoc);

    Value *ZeroArg =
        Constant::getNullValue(ParallelRegionFnTy->getParamType(0));

    // Now that we have most of the CFG skeleton it is time for the if-cascade
    // that checks the function pointer we got from the runtime against the
    // parallel regions we expect, if there are any.
    for (int i = 0, e = ReachedKnownParallelRegions.size(); i < e; ++i) {
      auto *ParallelRegion = ReachedKnownParallelRegions[i];
      BasicBlock *PRExecuteBB = BasicBlock::Create(
          Ctx, "worker_state_machine.parallel_region.execute", Kernel,
          StateMachineEndParallelBB);
      CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
          ->setDebugLoc(DLoc);
      BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
          ->setDebugLoc(DLoc);

      BasicBlock *PRNextBB =
          BasicBlock::Create(Ctx, "worker_state_machine.parallel_region.check",
                             Kernel, StateMachineEndParallelBB);

      // Check if we need to compare the pointer at all or if we can just
      // call the parallel region function.
      Value *IsPR;
      if (i + 1 < e || !ReachedUnknownParallelRegions.empty()) {
        Instruction *CmpI = ICmpInst::Create(
            ICmpInst::ICmp, llvm::CmpInst::ICMP_EQ, WorkFnCast, ParallelRegion,
            "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
        CmpI->setDebugLoc(DLoc);
        IsPR = CmpI;
      } else {
        // Last known region and no fallback: the call is unconditional.
        IsPR = ConstantInt::getTrue(Ctx);
      }

      BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
                         StateMachineIfCascadeCurrentBB)
          ->setDebugLoc(DLoc);
      StateMachineIfCascadeCurrentBB = PRNextBB;
    }

    // At the end of the if-cascade we place the indirect function pointer call
    // in case we might need it, that is if there can be parallel regions we
    // have not handled in the if-cascade above.
    if (!ReachedUnknownParallelRegions.empty()) {
      StateMachineIfCascadeCurrentBB->setName(
          "worker_state_machine.parallel_region.fallback.execute");
      CallInst::Create(ParallelRegionFnTy, WorkFnCast, {ZeroArg, GTid}, "",
                       StateMachineIfCascadeCurrentBB)
          ->setDebugLoc(DLoc);
    }
    BranchInst::Create(StateMachineEndParallelBB,
                       StateMachineIfCascadeCurrentBB)
        ->setDebugLoc(DLoc);

    // Signal the runtime that the parallel region has finished.
    CallInst::Create(OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
                         M, OMPRTL___kmpc_kernel_end_parallel),
                     {}, "", StateMachineEndParallelBB)
        ->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
        ->setDebugLoc(DLoc);

    // Rendezvous with the main thread, then loop back for more work.
    CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
        ->setDebugLoc(DLoc);
    BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
        ->setDebugLoc(DLoc);

    return ChangeStatus::CHANGED;
  }
|
|
|
|
|
|
|
|
  /// Fixpoint iteration update function. Will be called every time a dependence
  /// changed its state (and in the beginning).
  ChangeStatus updateImpl(Attributor &A) override {
    // Snapshot the state so we can report whether this update changed it.
    KernelInfoState StateBefore = getState();

    // Callback to check a read/write instruction. Instructions that might be
    // incompatible with SPMD execution are collected in the
    // SPMDCompatibilityTracker; returning true keeps the traversal going.
    auto CheckRWInst = [&](Instruction &I) {
      // We handle calls later.
      if (isa<CallBase>(I))
        return true;
      // We only care about write effects.
      if (!I.mayWriteToMemory())
        return true;
      // Stores whose underlying objects are all allocas are thread-private
      // and therefore SPMD-compatible.
      if (auto *SI = dyn_cast<StoreInst>(&I)) {
        SmallVector<const Value *> Objects;
        getUnderlyingObjects(SI->getPointerOperand(), Objects);
        if (llvm::all_of(Objects,
                         [](const Value *Obj) { return isa<AllocaInst>(Obj); }))
          return true;
      }
      // For now we give up on everything but stores.
      SPMDCompatibilityTracker.insert(&I);
      return true;
    };

    bool UsedAssumedInformationInCheckRWInst = false;
    // If we cannot visit all read/write instructions, assume the worst for
    // SPMD compatibility.
    if (!A.checkForAllReadWriteInstructions(
            CheckRWInst, *this, UsedAssumedInformationInCheckRWInst))
      SPMDCompatibilityTracker.indicatePessimisticFixpoint();

    // Non-entry functions inherit the set of kernels that can reach them
    // from their callers.
    if (!IsKernelEntry)
      updateReachingKernelEntries(A);

    // Callback to check a call instruction. We merge the callee's kernel info
    // state into ours if it is valid.
    auto CheckCallInst = [&](Instruction &I) {
      auto &CB = cast<CallBase>(I);
      auto &CBAA = A.getAAFor<AAKernelInfo>(
          *this, IRPosition::callsite_function(CB), DepClassTy::OPTIONAL);
      if (CBAA.getState().isValidState())
        getState() ^= CBAA.getState();
      return true;
    };

    bool UsedAssumedInformationInCheckCallInst = false;
    // If we cannot visit all call-like instructions we have to give up on the
    // whole state.
    if (!A.checkForAllCallLikeInstructions(
            CheckCallInst, *this, UsedAssumedInformationInCheckCallInst))
      return indicatePessimisticFixpoint();

    return StateBefore == getState() ? ChangeStatus::UNCHANGED
                                     : ChangeStatus::CHANGED;
  }
|
[AbstractAttributor] Fold function calls to `__kmpc_is_spmd_exec_mode` if possible
In the device runtime there are many function calls to `__kmpc_is_spmd_exec_mode`
to query the execution mode of current kernels. In many cases, user programs
only contain target region executing in one mode. As a consequence, those runtime
function calls will only return one value. If we can get rid of these function
calls during compilation, it can potentially improve performance.
In this patch, we use `AAKernelInfo` to analyze kernel execution. Basically, for
each kernel (device) function `F`, we collect all kernel entries `K` that can
reach `F`. A new AA, `AAFoldRuntimeCall`, is created for each call site. In each
iteration, it will check all reaching kernel entries, and update the folded value
accordingly.
In the future we will support more function.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D105787
2021-07-16 00:23:12 +02:00
|
|
|
|
|
|
|
private:
|
|
|
|
/// Update info regarding reaching kernels.
|
|
|
|
void updateReachingKernelEntries(Attributor &A) {
|
|
|
|
auto PredCallSite = [&](AbstractCallSite ACS) {
|
|
|
|
Function *Caller = ACS.getInstruction()->getFunction();
|
|
|
|
|
|
|
|
assert(Caller && "Caller is nullptr");
|
|
|
|
|
|
|
|
auto &CAA =
|
|
|
|
A.getOrCreateAAFor<AAKernelInfo>(IRPosition::function(*Caller));
|
|
|
|
if (CAA.ReachingKernelEntries.isValidState()) {
|
|
|
|
ReachingKernelEntries ^= CAA.ReachingKernelEntries;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We lost track of the caller of the associated function, any kernel
|
|
|
|
// could reach now.
|
|
|
|
ReachingKernelEntries.indicatePessimisticFixpoint();
|
|
|
|
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
|
|
|
bool AllCallSitesKnown;
|
|
|
|
if (!A.checkForAllCallSites(PredCallSite, *this,
|
|
|
|
true /* RequireAllCallSites */,
|
|
|
|
AllCallSitesKnown))
|
|
|
|
ReachingKernelEntries.indicatePessimisticFixpoint();
|
|
|
|
}
|
2021-05-20 07:37:29 +02:00
|
|
|
};
|
|
|
|
|
|
|
|
/// The call site kernel info abstract attribute, basically, what can we say
/// about a call site with regards to the KernelInfoState. For now this simply
/// forwards the information from the callee.
struct AAKernelInfoCallSite : AAKernelInfo {
  AAKernelInfoCallSite(const IRPosition &IRP, Attributor &A)
      : AAKernelInfo(IRP, A) {}

  /// See AbstractAttribute::initialize(...).
  ///
  /// Classifies the call site: harmless (readonly/intrinsic), a known OpenMP
  /// runtime call (handled by the switch below), or an unknown call whose
  /// effects we have to approximate conservatively.
  void initialize(Attributor &A) override {
    AAKernelInfo::initialize(A);

    CallBase &CB = cast<CallBase>(getAssociatedValue());
    Function *Callee = getAssociatedFunction();

    // Helper to lookup an assumption string.
    auto HasAssumption = [](Function *Fn, StringRef AssumptionStr) {
      return Fn && hasAssumption(*Fn, AssumptionStr);
    };

    // Check for SPMD-mode assumptions.
    if (HasAssumption(Callee, "ompx_spmd_amenable"))
      SPMDCompatibilityTracker.indicateOptimisticFixpoint();

    // First weed out calls we do not care about, that is readonly/readnone
    // calls, intrinsics, and "no_openmp" calls. Neither of these can reach a
    // parallel region or anything else we are looking for.
    if (!CB.mayWriteToMemory() || isa<IntrinsicInst>(CB)) {
      indicateOptimisticFixpoint();
      return;
    }

    // Next we check if we know the callee. If it is a known OpenMP function
    // we will handle them explicitly in the switch below. If it is not, we
    // will use an AAKernelInfo object on the callee to gather information and
    // merge that into the current state. The latter happens in the updateImpl.
    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
    if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
      // Unknown caller or declarations are not analyzable, we give up.
      if (!Callee || !A.isFunctionIPOAmendable(*Callee)) {

        // Unknown callees might contain parallel regions, except if they have
        // an appropriate assumption attached.
        if (!(HasAssumption(Callee, "omp_no_openmp") ||
              HasAssumption(Callee, "omp_no_parallelism")))
          ReachedUnknownParallelRegions.insert(&CB);

        // If SPMDCompatibilityTracker is not fixed, we need to give up on the
        // idea we can run something unknown in SPMD-mode.
        if (!SPMDCompatibilityTracker.isAtFixpoint())
          SPMDCompatibilityTracker.insert(&CB);

        // We have updated the state for this unknown call properly, there won't
        // be any change so we indicate a fixpoint.
        indicateOptimisticFixpoint();
      }
      // If the callee is known and can be used in IPO, we will update the state
      // based on the callee state in updateImpl.
      return;
    }

    // Position of the outlined-region wrapper in a __kmpc_parallel_51 call.
    const unsigned int WrapperFunctionArgNo = 6;
    RuntimeFunction RF = It->getSecond();
    switch (RF) {
    // All the functions we know are compatible with SPMD mode.
    case OMPRTL___kmpc_is_spmd_exec_mode:
    case OMPRTL___kmpc_for_static_fini:
    case OMPRTL___kmpc_global_thread_num:
    case OMPRTL___kmpc_single:
    case OMPRTL___kmpc_end_single:
    case OMPRTL___kmpc_master:
    case OMPRTL___kmpc_end_master:
    case OMPRTL___kmpc_barrier:
      break;
    case OMPRTL___kmpc_for_static_init_4:
    case OMPRTL___kmpc_for_static_init_4u:
    case OMPRTL___kmpc_for_static_init_8:
    case OMPRTL___kmpc_for_static_init_8u: {
      // Check the schedule and allow static schedule in SPMD mode.
      unsigned ScheduleArgOpNo = 2;
      auto *ScheduleTypeCI =
          dyn_cast<ConstantInt>(CB.getArgOperand(ScheduleArgOpNo));
      // A non-constant schedule maps to 0, which falls into the default
      // (SPMD-incompatible) case below.
      unsigned ScheduleTypeVal =
          ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
      switch (OMPScheduleType(ScheduleTypeVal)) {
      case OMPScheduleType::Static:
      case OMPScheduleType::StaticChunked:
      case OMPScheduleType::Distribute:
      case OMPScheduleType::DistributeChunked:
        break;
      default:
        SPMDCompatibilityTracker.insert(&CB);
        break;
      };
    } break;
    case OMPRTL___kmpc_target_init:
      KernelInitCB = &CB;
      break;
    case OMPRTL___kmpc_target_deinit:
      KernelDeinitCB = &CB;
      break;
    case OMPRTL___kmpc_parallel_51:
      if (auto *ParallelRegion = dyn_cast<Function>(
              CB.getArgOperand(WrapperFunctionArgNo)->stripPointerCasts())) {
        ReachedKnownParallelRegions.insert(ParallelRegion);
        break;
      }
      // The condition above should usually get the parallel region function
      // pointer and record it. In the off chance it doesn't we assume the
      // worst.
      ReachedUnknownParallelRegions.insert(&CB);
      break;
    case OMPRTL___kmpc_omp_task:
      // We do not look into tasks right now, just give up.
      SPMDCompatibilityTracker.insert(&CB);
      ReachedUnknownParallelRegions.insert(&CB);
      break;
    default:
      // Unknown OpenMP runtime calls cannot be executed in SPMD-mode,
      // generally.
      SPMDCompatibilityTracker.insert(&CB);
      break;
    }
    // All other OpenMP runtime calls will not reach parallel regions so they
    // can be safely ignored for now. Since it is a known OpenMP runtime call we
    // have now modeled all effects and there is no need for any update.
    indicateOptimisticFixpoint();
  }

  /// Forward the callee's kernel info state to this call site.
  ChangeStatus updateImpl(Attributor &A) override {
    // TODO: Once we have call site specific value information we can provide
    //       call site specific liveness information and then it makes
    //       sense to specialize attributes for call sites arguments instead of
    //       redirecting requests to the callee argument.
    Function *F = getAssociatedFunction();
    const IRPosition &FnPos = IRPosition::function(*F);
    auto &FnAA = A.getAAFor<AAKernelInfo>(*this, FnPos, DepClassTy::REQUIRED);
    if (getState() == FnAA.getState())
      return ChangeStatus::UNCHANGED;
    getState() = FnAA.getState();
    return ChangeStatus::CHANGED;
  }
};
|
|
|
|
|
[AbstractAttributor] Fold function calls to `__kmpc_is_spmd_exec_mode` if possible
In the device runtime there are many function calls to `__kmpc_is_spmd_exec_mode`
to query the execution mode of current kernels. In many cases, user programs
only contain target region executing in one mode. As a consequence, those runtime
function calls will only return one value. If we can get rid of these function
calls during compliation, it can potentially improve performance.
In this patch, we use `AAKernelInfo` to analyze kernel execution. Basically, for
each kernel (device) function `F`, we collect all kernel entries `K` that can
reach `F`. A new AA, `AAFoldRuntimeCall`, is created for each call site. In each
iteration, it will check all reaching kernel entries, and update the folded value
accordingly.
In the future we will support more function.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D105787
2021-07-16 00:23:12 +02:00
|
|
|
/// Abstract attribute that tries to fold a known OpenMP runtime call to a
/// constant. Concrete folding logic lives in the derived, position-specific
/// attributes; this base only provides the AA identity boilerplate.
struct AAFoldRuntimeCall
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;

  AAFoldRuntimeCall(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Statistics are tracked as part of manifest for now.
  void trackStatistics() const override {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAFoldRuntimeCall &createForPosition(const IRPosition &IRP,
                                              Attributor &A);

  /// See AbstractAttribute::getName()
  const std::string getName() const override { return "AAFoldRuntimeCall"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAFoldRuntimeCall
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
|
|
|
|
|
|
|
|
struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
|
|
|
|
AAFoldRuntimeCallCallSiteReturned(const IRPosition &IRP, Attributor &A)
|
|
|
|
: AAFoldRuntimeCall(IRP, A) {}
|
|
|
|
|
|
|
|
/// See AbstractAttribute::getAsStr()
|
|
|
|
const std::string getAsStr() const override {
|
|
|
|
if (!isValidState())
|
|
|
|
return "<invalid>";
|
|
|
|
|
|
|
|
std::string Str("simplified value: ");
|
|
|
|
|
|
|
|
if (!SimplifiedValue.hasValue())
|
|
|
|
return Str + std::string("none");
|
|
|
|
|
|
|
|
if (!SimplifiedValue.getValue())
|
|
|
|
return Str + std::string("nullptr");
|
|
|
|
|
|
|
|
if (ConstantInt *CI = dyn_cast<ConstantInt>(SimplifiedValue.getValue()))
|
|
|
|
return Str + std::to_string(CI->getSExtValue());
|
|
|
|
|
|
|
|
return Str + std::string("unknown");
|
|
|
|
}
|
|
|
|
|
|
|
|
  /// See AbstractAttribute::initialize(...).
  ///
  /// Records which runtime function this call site targets and registers a
  /// simplification callback that exposes our folded value (if any) to other
  /// parts of the Attributor.
  void initialize(Attributor &A) override {
    Function *Callee = getAssociatedFunction();

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
    assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
           "Expected a known OpenMP runtime function");

    RFKind = It->getSecond();

    CallBase &CB = cast<CallBase>(getAssociatedValue());
    // NOTE: The lambda captures `this` by reference; it stays valid because
    // the Attributor owns this AA for the duration of the fixpoint iteration.
    A.registerSimplificationCallback(
        IRPosition::callsite_returned(CB),
        [&](const IRPosition &IRP, const AbstractAttribute *AA,
            bool &UsedAssumedInformation) -> Optional<Value *> {
          // An invalid state must have been frozen as "known nullptr".
          assert((isValidState() || (SimplifiedValue.hasValue() &&
                                     SimplifiedValue.getValue() == nullptr)) &&
                 "Unexpected invalid state!");

          // If we are not at a fixpoint yet the answer is only assumed, so
          // record a dependence on the querying AA.
          if (!isAtFixpoint()) {
            UsedAssumedInformation = true;
            if (AA)
              A.recordDependence(*this, *AA, DepClassTy::OPTIONAL);
          }
          return SimplifiedValue;
        });
  }
|
|
|
|
|
|
|
|
ChangeStatus updateImpl(Attributor &A) override {
|
|
|
|
ChangeStatus Changed = ChangeStatus::UNCHANGED;
|
|
|
|
|
|
|
|
switch (RFKind) {
|
|
|
|
case OMPRTL___kmpc_is_spmd_exec_mode:
|
2021-07-16 05:51:38 +02:00
|
|
|
Changed |= foldIsSPMDExecMode(A);
|
[AbstractAttributor] Fold function calls to `__kmpc_is_spmd_exec_mode` if possible
In the device runtime there are many function calls to `__kmpc_is_spmd_exec_mode`
to query the execution mode of current kernels. In many cases, user programs
only contain target region executing in one mode. As a consequence, those runtime
function calls will only return one value. If we can get rid of these function
calls during compliation, it can potentially improve performance.
In this patch, we use `AAKernelInfo` to analyze kernel execution. Basically, for
each kernel (device) function `F`, we collect all kernel entries `K` that can
reach `F`. A new AA, `AAFoldRuntimeCall`, is created for each call site. In each
iteration, it will check all reaching kernel entries, and update the folded value
accordingly.
In the future we will support more function.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D105787
2021-07-16 00:23:12 +02:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
llvm_unreachable("Unhandled OpenMP runtime function!");
|
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
|
|
|
ChangeStatus manifest(Attributor &A) override {
|
|
|
|
ChangeStatus Changed = ChangeStatus::UNCHANGED;
|
|
|
|
|
|
|
|
if (SimplifiedValue.hasValue() && SimplifiedValue.getValue()) {
|
|
|
|
Instruction &CB = *getCtxI();
|
|
|
|
A.changeValueAfterManifest(CB, **SimplifiedValue);
|
|
|
|
A.deleteAfterManifest(CB);
|
|
|
|
Changed = ChangeStatus::CHANGED;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
|
|
|
ChangeStatus indicatePessimisticFixpoint() override {
|
|
|
|
SimplifiedValue = nullptr;
|
|
|
|
return AAFoldRuntimeCall::indicatePessimisticFixpoint();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
/// Fold __kmpc_is_spmd_exec_mode into a constant if possible.
|
|
|
|
ChangeStatus foldIsSPMDExecMode(Attributor &A) {
|
|
|
|
Optional<Value *> SimplifiedValueBefore = SimplifiedValue;
|
|
|
|
|
|
|
|
unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
|
|
|
|
unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
|
|
|
|
auto &CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
|
|
|
|
*this, IRPosition::function(*getAnchorScope()), DepClassTy::REQUIRED);
|
|
|
|
|
|
|
|
if (!CallerKernelInfoAA.ReachingKernelEntries.isValidState())
|
|
|
|
return indicatePessimisticFixpoint();
|
|
|
|
|
|
|
|
for (Kernel K : CallerKernelInfoAA.ReachingKernelEntries) {
|
|
|
|
auto &AA = A.getAAFor<AAKernelInfo>(*this, IRPosition::function(*K),
|
|
|
|
DepClassTy::REQUIRED);
|
|
|
|
|
|
|
|
if (!AA.isValidState()) {
|
|
|
|
SimplifiedValue = nullptr;
|
|
|
|
return indicatePessimisticFixpoint();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (AA.SPMDCompatibilityTracker.isAssumed()) {
|
|
|
|
if (AA.SPMDCompatibilityTracker.isAtFixpoint())
|
|
|
|
++KnownSPMDCount;
|
|
|
|
else
|
|
|
|
++AssumedSPMDCount;
|
|
|
|
} else {
|
|
|
|
if (AA.SPMDCompatibilityTracker.isAtFixpoint())
|
|
|
|
++KnownNonSPMDCount;
|
|
|
|
else
|
|
|
|
++AssumedNonSPMDCount;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (KnownSPMDCount && KnownNonSPMDCount)
|
|
|
|
return indicatePessimisticFixpoint();
|
|
|
|
|
|
|
|
if (AssumedSPMDCount && AssumedNonSPMDCount)
|
|
|
|
return indicatePessimisticFixpoint();
|
|
|
|
|
|
|
|
auto &Ctx = getAnchorValue().getContext();
|
|
|
|
if (KnownSPMDCount || AssumedSPMDCount) {
|
|
|
|
assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
|
|
|
|
"Expected only SPMD kernels!");
|
|
|
|
// All reaching kernels are in SPMD mode. Update all function calls to
|
|
|
|
// __kmpc_is_spmd_exec_mode to 1.
|
|
|
|
SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
|
|
|
|
} else {
|
|
|
|
assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
|
|
|
|
"Expected only non-SPMD kernels!");
|
|
|
|
// All reaching kernels are in non-SPMD mode. Update all function
|
|
|
|
// calls to __kmpc_is_spmd_exec_mode to 0.
|
|
|
|
SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
|
|
|
|
}
|
|
|
|
|
|
|
|
return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
|
|
|
|
: ChangeStatus::CHANGED;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// An optional value the associated value is assumed to fold to. That is, we
|
|
|
|
/// assume the associated value (which is a call) can be replaced by this
|
|
|
|
/// simplified value.
|
|
|
|
Optional<Value *> SimplifiedValue;
|
|
|
|
|
|
|
|
/// The runtime function kind of the callee of the associated call site.
|
|
|
|
RuntimeFunction RFKind;
|
|
|
|
};
|
|
|
|
|
2019-11-07 06:20:06 +01:00
|
|
|
} // namespace
|
|
|
|
|
2021-05-20 07:37:29 +02:00
|
|
|
/// Seed the Attributor with all abstract attributes the OpenMP optimizer
/// needs. The registration *order* here is significant (see below).
void OpenMPOpt::registerAAs(bool IsModulePass) {
  if (SCC.empty())

    return;
  if (IsModulePass) {
    // Ensure we create the AAKernelInfo AAs first and without triggering an
    // update. This will make sure we register all value simplification
    // callbacks before any other AA has the chance to create an AAValueSimplify
    // or similar.
    for (Function *Kernel : OMPInfoCache.Kernels)
      A.getOrCreateAAFor<AAKernelInfo>(
          IRPosition::function(*Kernel), /* QueryingAA */ nullptr,
          DepClassTy::NONE, /* ForceUpdate */ false,
          /* UpdateAfterInit */ false);

    // Create a folding AA for every direct call to __kmpc_is_spmd_exec_mode.
    auto &IsSPMDRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_is_spmd_exec_mode];
    IsSPMDRFI.foreachUse(SCC, [&](Use &U, Function &) {
      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &IsSPMDRFI);
      if (!CI)
        return false;
      A.getOrCreateAAFor<AAFoldRuntimeCall>(
          IRPosition::callsite_returned(*CI), /* QueryingAA */ nullptr,
          DepClassTy::NONE, /* ForceUpdate */ false,
          /* UpdateAfterInit */ false);
      return false;
    });
  }

  // Create CallSite AA for all Getters.
  // NOTE(review): the "- 1" presumably excludes a trailing sentinel enum
  // value in InternalControlVar — confirm against the ICV enum definition.
  for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
    auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];

    auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];

    auto CreateAA = [&](Use &U, Function &Caller) {
      CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
      if (!CI)
        return false;

      auto &CB = cast<CallBase>(*CI);

      IRPosition CBPos = IRPosition::callsite_function(CB);
      A.getOrCreateAAFor<AAICVTracker>(CBPos);
      return false;
    };

    GetterRFI.foreachUse(SCC, CreateAA);
  }

  // For every function that calls __kmpc_alloc_shared, try to move the
  // globalized allocations it performs (AAHeapToShared).
  auto &GlobalizationRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
  auto CreateAA = [&](Use &U, Function &F) {
    A.getOrCreateAAFor<AAHeapToShared>(IRPosition::function(F));
    return false;
  };
  GlobalizationRFI.foreachUse(SCC, CreateAA);

  // Create an ExecutionDomain AA for every function and a HeapToStack AA for
  // every function if there is a device kernel.
  for (auto *F : SCC) {
    if (!F->isDeclaration())
      A.getOrCreateAAFor<AAExecutionDomain>(IRPosition::function(*F));
    if (isOpenMPDevice(M))
      A.getOrCreateAAFor<AAHeapToStack>(IRPosition::function(*F));
  }
}
|
|
|
|
|
2020-07-11 01:06:46 +02:00
|
|
|
// Unique ID tags for the abstract attribute classes. Their *addresses* (not
// values) identify each class; see the respective getIdAddr()/classof().
const char AAICVTracker::ID = 0;
const char AAKernelInfo::ID = 0;
const char AAExecutionDomain::ID = 0;
const char AAHeapToShared::ID = 0;
const char AAFoldRuntimeCall::ID = 0;
|
2020-07-11 01:06:46 +02:00
|
|
|
|
|
|
|
/// Factory: create the AAICVTracker implementation that matches the kind of
/// IR position \p IRP; all instances live in the Attributor's allocator.
AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    llvm_unreachable("ICVTracker can only be created for function position!");
  case IRPosition::IRP_RETURNED:
    return *(new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A));
  case IRPosition::IRP_CALL_SITE_RETURNED:
    return *(new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A));
  case IRPosition::IRP_CALL_SITE:
    return *(new (A.Allocator) AAICVTrackerCallSite(IRP, A));
  case IRPosition::IRP_FUNCTION:
    return *(new (A.Allocator) AAICVTrackerFunction(IRP, A));
  }
  llvm_unreachable("Invalid IRPosition kind!");
}
|
|
|
|
|
2021-04-28 22:22:53 +02:00
|
|
|
/// Factory: AAExecutionDomain only exists for whole-function positions.
AAExecutionDomain &AAExecutionDomain::createForPosition(const IRPosition &IRP,
                                                        Attributor &A) {
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_FUNCTION:
    return *(new (A.Allocator) AAExecutionDomainFunction(IRP, A));
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_CALL_SITE_RETURNED:
  case IRPosition::IRP_CALL_SITE:
    llvm_unreachable(
        "AAExecutionDomain can only be created for function position!");
  }
  llvm_unreachable("Invalid IRPosition kind!");
}
|
|
|
|
|
2021-03-22 21:35:55 +01:00
|
|
|
/// Factory: AAHeapToShared only exists for whole-function positions.
AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
                                                  Attributor &A) {
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_FUNCTION:
    return *(new (A.Allocator) AAHeapToSharedFunction(IRP, A));
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_CALL_SITE_RETURNED:
  case IRPosition::IRP_CALL_SITE:
    llvm_unreachable(
        "AAHeapToShared can only be created for function position!");
  }
  llvm_unreachable("Invalid IRPosition kind!");
}
|
|
|
|
|
2021-05-20 07:37:29 +02:00
|
|
|
/// Factory: AAKernelInfo exists for function positions (tracking kernel
/// state) and for call-site positions (propagating/checking that state
/// across calls).
AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
                                              Attributor &A) {
  AAKernelInfo *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_CALL_SITE_RETURNED:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    // Fixed: the old message claimed only "function position" even though a
    // call-site position is also handled below.
    llvm_unreachable(
        "KernelInfo can only be created for function or call site position!");
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
    break;
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
    break;
  }

  return *AA;
}
|
|
|
|
|
[AbstractAttributor] Fold function calls to `__kmpc_is_spmd_exec_mode` if possible
In the device runtime there are many function calls to `__kmpc_is_spmd_exec_mode`
to query the execution mode of current kernels. In many cases, user programs
only contain target region executing in one mode. As a consequence, those runtime
function calls will only return one value. If we can get rid of these function
calls during compliation, it can potentially improve performance.
In this patch, we use `AAKernelInfo` to analyze kernel execution. Basically, for
each kernel (device) function `F`, we collect all kernel entries `K` that can
reach `F`. A new AA, `AAFoldRuntimeCall`, is created for each call site. In each
iteration, it will check all reaching kernel entries, and update the folded value
accordingly.
In the future we will support more function.
Reviewed By: jdoerfert
Differential Revision: https://reviews.llvm.org/D105787
2021-07-16 00:23:12 +02:00
|
|
|
/// Factory: AAFoldRuntimeCall only exists for call-site-returned positions,
/// since it folds the *return value* of an OpenMP runtime call.
AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
                                                        Attributor &A) {
  AAFoldRuntimeCall *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
  case IRPosition::IRP_FLOAT:
  case IRPosition::IRP_ARGUMENT:
  case IRPosition::IRP_RETURNED:
  case IRPosition::IRP_FUNCTION:
  case IRPosition::IRP_CALL_SITE:
  case IRPosition::IRP_CALL_SITE_ARGUMENT:
    // Fixed copy-paste defect: the old message referred to "KernelInfo"
    // although this is the AAFoldRuntimeCall factory.
    llvm_unreachable(
        "AAFoldRuntimeCall can only be created for call site position!");
  case IRPosition::IRP_CALL_SITE_RETURNED:
    AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
    break;
  }

  return *AA;
}
|
|
|
|
|
2021-03-24 15:11:32 +01:00
|
|
|
/// New-PM module-level entry point: internalize device functions (so all call
/// edges are visible), then run the Attributor-based OpenMP optimizations on
/// every remaining function.
PreservedAnalyses OpenMPOptPass::run(Module &M, ModuleAnalysisManager &AM) {
  if (!containsOpenMP(M))
    return PreservedAnalyses::all();
  if (DisableOpenMPOptimizations)
    return PreservedAnalyses::all();

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
  KernelSet Kernels = getDeviceKernels(M);

  // A function is "called" if it is a kernel entry or has any user other
  // than a blockaddress; only such functions are candidates below.
  auto IsCalled = [&](Function &F) {
    if (Kernels.contains(&F))
      return true;
    for (const User *U : F.users())
      if (!isa<BlockAddress>(U))
        return true;
    return false;
  };

  // Emit a missed-internalization remark (OMP140) for \p F.
  auto EmitRemark = [&](Function &F) {
    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
    ORE.emit([&]() {
      OptimizationRemarkAnalysis ORA(DEBUG_TYPE, "OMP140", &F);
      return ORA << "Could not internalize function. "
                 << "Some optimizations may not be possible.";
    });
  };

  // Create internal copies of each function if this is a kernel Module. This
  // allows interprocedural passes to see every call edge.
  DenseSet<const Function *> InternalizedFuncs;
  if (isOpenMPDevice(M))
    for (Function &F : M)
      if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F)) {
        if (Attributor::internalizeFunction(F, /* Force */ true)) {
          InternalizedFuncs.insert(&F);
        } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
          // Internalization failed; remark unless the function is already
          // local or marked cold.
          EmitRemark(F);
        }
      }

  // Look at every function in the Module unless it was internalized.
  SmallVector<Function *, 16> SCC;
  for (Function &F : M)
    if (!F.isDeclaration() && !InternalizedFuncs.contains(&F))
      SCC.push_back(&F);

  if (SCC.empty())
    return PreservedAnalyses::all();

  AnalysisGetter AG(FAM);

  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };

  BumpPtrAllocator Allocator;
  CallGraphUpdater CGUpdater;

  SetVector<Function *> Functions(SCC.begin(), SCC.end());
  OMPInformationCache InfoCache(M, AG, Allocator, /*CGSCC*/ Functions, Kernels);

  // Device modules get a larger fixpoint-iteration budget than host modules.
  unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
  Attributor A(Functions, InfoCache, CGUpdater, nullptr, true, false,
               MaxFixpointIterations, OREGetter, DEBUG_TYPE);

  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
  bool Changed = OMPOpt.run(true);
  if (Changed)
    return PreservedAnalyses::none();

  return PreservedAnalyses::all();
}
|
|
|
|
|
|
|
|
/// New-PM CGSCC entry point: run the OpenMP optimizations on one SCC of the
/// lazy call graph.
PreservedAnalyses OpenMPOptCGSCCPass::run(LazyCallGraph::SCC &C,
                                          CGSCCAnalysisManager &AM,
                                          LazyCallGraph &CG,
                                          CGSCCUpdateResult &UR) {
  if (!containsOpenMP(*C.begin()->getFunction().getParent()))
    return PreservedAnalyses::all();
  if (DisableOpenMPOptimizations)
    return PreservedAnalyses::all();

  SmallVector<Function *, 16> SCC;
  // If there are kernels in the module, we have to run on all SCC's.
  // NOTE(review): unlike the legacy runOnSCC below, declarations are not
  // skipped here — confirm whether LazyCallGraph nodes can be declarations.
  for (LazyCallGraph::Node &N : C) {
    Function *Fn = &N.getFunction();
    SCC.push_back(Fn);
  }

  if (SCC.empty())
    return PreservedAnalyses::all();

  Module &M = *C.begin()->getFunction().getParent();

  KernelSet Kernels = getDeviceKernels(M);

  FunctionAnalysisManager &FAM =
      AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();

  AnalysisGetter AG(FAM);

  auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
    return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
  };

  BumpPtrAllocator Allocator;
  CallGraphUpdater CGUpdater;
  CGUpdater.initialize(CG, C, AM, UR);

  SetVector<Function *> Functions(SCC.begin(), SCC.end());
  OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
                                /*CGSCC*/ Functions, Kernels);

  // Device modules get a larger fixpoint-iteration budget than host modules.
  unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
  Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
               MaxFixpointIterations, OREGetter, DEBUG_TYPE);

  OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
  bool Changed = OMPOpt.run(false);
  if (Changed)
    return PreservedAnalyses::none();

  return PreservedAnalyses::all();
}
|
2021-05-13 21:54:22 +02:00
|
|
|
|
2019-11-07 06:20:06 +01:00
|
|
|
namespace {
|
|
|
|
|
2021-03-24 15:11:32 +01:00
|
|
|
/// Legacy pass-manager wrapper that runs the OpenMP optimizations per
/// call-graph SCC.
struct OpenMPOptCGSCCLegacyPass : public CallGraphSCCPass {
  CallGraphUpdater CGUpdater;
  static char ID; // Pass identification, address used as unique ID.

  OpenMPOptCGSCCLegacyPass() : CallGraphSCCPass(ID) {
    initializeOpenMPOptCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    CallGraphSCCPass::getAnalysisUsage(AU);
  }

  bool runOnSCC(CallGraphSCC &CGSCC) override {
    if (!containsOpenMP(CGSCC.getCallGraph().getModule()))
      return false;
    if (DisableOpenMPOptimizations || skipSCC(CGSCC))
      return false;

    SmallVector<Function *, 16> SCC;
    // If there are kernels in the module, we have to run on all SCC's.
    for (CallGraphNode *CGN : CGSCC) {
      Function *Fn = CGN->getFunction();
      // External/declaration nodes carry no body to optimize.
      if (!Fn || Fn->isDeclaration())
        continue;
      SCC.push_back(Fn);
    }

    if (SCC.empty())
      return false;

    Module &M = CGSCC.getCallGraph().getModule();
    KernelSet Kernels = getDeviceKernels(M);

    CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
    CGUpdater.initialize(CG, CGSCC);

    // Maintain a map of functions to avoid rebuilding the ORE
    DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
    auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
      std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
      if (!ORE)
        ORE = std::make_unique<OptimizationRemarkEmitter>(F);
      return *ORE;
    };

    AnalysisGetter AG;
    SetVector<Function *> Functions(SCC.begin(), SCC.end());
    BumpPtrAllocator Allocator;
    OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG,
                                  Allocator,
                                  /*CGSCC*/ Functions, Kernels);

    // Device modules get a larger fixpoint-iteration budget than host modules.
    unsigned MaxFixpointIterations = (isOpenMPDevice(M)) ? 128 : 32;
    Attributor A(Functions, InfoCache, CGUpdater, nullptr, false, true,
                 MaxFixpointIterations, OREGetter, DEBUG_TYPE);

    OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
    return OMPOpt.run(false);
  }

  bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
};
|
|
|
|
|
|
|
|
} // end anonymous namespace
|
|
|
|
|
2021-06-24 23:11:54 +02:00
|
|
|
/// Collect the device kernel entry points of \p M.
///
/// Kernels are identified via the "nvvm.annotations" named metadata: every
/// operand of the form {function, "kernel", ...} marks a kernel entry.
KernelSet llvm::omp::getDeviceKernels(Module &M) {
  // TODO: Create a more cross-platform way of determining device kernels.
  // Use getNamedMetadata, not getOrInsertNamedMetadata: this is a pure query
  // and must not insert an empty "nvvm.annotations" node into the module as a
  // side effect. (With getOrInsert* the null check below could never fire.)
  NamedMDNode *MD = M.getNamedMetadata("nvvm.annotations");
  KernelSet Kernels;

  if (!MD)
    return Kernels;

  for (auto *Op : MD->operands()) {
    // An annotation needs at least {value, kind}.
    if (Op->getNumOperands() < 2)
      continue;
    MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
    if (!KindID || KindID->getString() != "kernel")
      continue;

    Function *KernelFn =
        mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
    if (!KernelFn)
      continue;

    ++NumOpenMPTargetRegionKernels;

    Kernels.insert(KernelFn);
  }

  return Kernels;
}
|
2020-07-11 09:36:07 +02:00
|
|
|
|
2021-06-24 23:11:54 +02:00
|
|
|
/// Return true if \p M was compiled with OpenMP support, signalled by the
/// presence of the "openmp" module flag.
bool llvm::omp::containsOpenMP(Module &M) {
  return M.getModuleFlag("openmp") != nullptr;
}
|
2020-07-07 02:19:12 +02:00
|
|
|
|
2021-06-24 23:11:54 +02:00
|
|
|
/// Return true if \p M is OpenMP *device* code, signalled by the presence of
/// the "openmp-device" module flag.
bool llvm::omp::isOpenMPDevice(Module &M) {
  return M.getModuleFlag("openmp-device") != nullptr;
}
|
|
|
|
|
2021-03-24 15:11:32 +01:00
|
|
|
// Legacy pass ID definition and registration with the legacy pass registry;
// the CallGraphWrapperPass dependency provides the call graph used by
// runOnSCC above.
char OpenMPOptCGSCCLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
                      "OpenMP specific optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(OpenMPOptCGSCCLegacyPass, "openmp-opt-cgscc",
                    "OpenMP specific optimizations", false, false)
|
|
|
|
|
2021-03-24 15:11:32 +01:00
|
|
|
/// Public factory for the legacy CGSCC OpenMP optimization pass; ownership
/// passes to the legacy pass manager.
Pass *llvm::createOpenMPOptCGSCCLegacyPass() {
  return new OpenMPOptCGSCCLegacyPass();
}
|