mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 20:23:11 +01:00
Re-commit : [PowerPC] Add handling for ColdCC calling convention and a pass to mark
candidates with coldcc attribute. This recommits r322721 reverted due to sanitizer memory leak build bot failures. Original commit message: This patch adds support for the coldcc calling convention for Power. This changes the set of non-volatile registers. It includes a pass to stress test the implementation by marking all static directly called functions with the coldcc attribute through the option -enable-coldcc-stress-test. It also includes an option, -ppc-enable-coldcc, to add the coldcc attribute to functions which are cold at all call sites based on BlockFrequencyInfo when the containing function does not call any non cold functions. Differential Revision: https://reviews.llvm.org/D38413 llvm-svn: 323778
This commit is contained in:
parent
3683524ce7
commit
e49dd688ba
@ -541,6 +541,10 @@ public:
|
||||
/// containing this constant value for the target.
|
||||
bool shouldBuildLookupTablesForConstant(Constant *C) const;
|
||||
|
||||
/// \brief Return true if the input function which is cold at all call sites,
|
||||
/// should use coldcc calling convention.
|
||||
bool useColdCCForColdCall(Function &F) const;
|
||||
|
||||
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
|
||||
|
||||
unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
|
||||
@ -992,6 +996,7 @@ public:
|
||||
virtual unsigned getJumpBufSize() = 0;
|
||||
virtual bool shouldBuildLookupTables() = 0;
|
||||
virtual bool shouldBuildLookupTablesForConstant(Constant *C) = 0;
|
||||
virtual bool useColdCCForColdCall(Function &F) = 0;
|
||||
virtual unsigned
|
||||
getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
|
||||
virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
|
||||
@ -1237,6 +1242,10 @@ public:
|
||||
bool shouldBuildLookupTablesForConstant(Constant *C) override {
|
||||
return Impl.shouldBuildLookupTablesForConstant(C);
|
||||
}
|
||||
bool useColdCCForColdCall(Function &F) override {
|
||||
return Impl.useColdCCForColdCall(F);
|
||||
}
|
||||
|
||||
unsigned getScalarizationOverhead(Type *Ty, bool Insert,
|
||||
bool Extract) override {
|
||||
return Impl.getScalarizationOverhead(Ty, Insert, Extract);
|
||||
|
@ -284,6 +284,8 @@ public:
|
||||
bool shouldBuildLookupTables() { return true; }
|
||||
bool shouldBuildLookupTablesForConstant(Constant *C) { return true; }
|
||||
|
||||
bool useColdCCForColdCall(Function &F) { return false; }
|
||||
|
||||
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -226,6 +226,10 @@ bool TargetTransformInfo::shouldBuildLookupTablesForConstant(Constant *C) const
|
||||
return TTIImpl->shouldBuildLookupTablesForConstant(C);
|
||||
}
|
||||
|
||||
bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
|
||||
return TTIImpl->useColdCCForColdCall(F);
|
||||
}
|
||||
|
||||
unsigned TargetTransformInfo::
|
||||
getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const {
|
||||
return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
|
||||
|
@ -45,6 +45,29 @@ def RetCC_PPC64_AnyReg : CallingConv<[
|
||||
CCCustom<"CC_PPC_AnyReg_Error">
|
||||
]>;
|
||||
|
||||
// Return-value convention for PowerPC coldcc.
|
||||
def RetCC_PPC_Cold : CallingConv<[
|
||||
// Use the same return registers as RetCC_PPC, but limited to only
|
||||
// one return value. The remaining return values will be saved to
|
||||
// the stack.
|
||||
CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
|
||||
CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
|
||||
|
||||
CCIfType<[i32], CCAssignToReg<[R3]>>,
|
||||
CCIfType<[i64], CCAssignToReg<[X3]>>,
|
||||
CCIfType<[i128], CCAssignToReg<[X3]>>,
|
||||
|
||||
CCIfType<[f32], CCAssignToReg<[F1]>>,
|
||||
CCIfType<[f64], CCAssignToReg<[F1]>>,
|
||||
|
||||
CCIfType<[v4f64, v4f32, v4i1],
|
||||
CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>,
|
||||
|
||||
CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
|
||||
CCIfSubtarget<"hasAltivec()",
|
||||
CCAssignToReg<[V2]>>>
|
||||
]>;
|
||||
|
||||
// Return-value convention for PowerPC
|
||||
def RetCC_PPC : CallingConv<[
|
||||
CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
|
||||
@ -271,6 +294,36 @@ def CSR_SVR464_R2_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2_Altivec)>
|
||||
|
||||
def CSR_NoRegs : CalleeSavedRegs<(add)>;
|
||||
|
||||
// coldcc calling convention marks most registers as non-volatile.
|
||||
// Do not include r1 since the stack pointer is never considered a CSR.
|
||||
// Do not include r2, since it is the TOC register and is added depending
|
||||
// on whether or not the function uses the TOC and is a non-leaf.
|
||||
// Do not include r0,r11,r13 as they are optional in functional linkage
|
||||
// and value may be altered by inter-library calls.
|
||||
// Do not include r12 as it is used as a scratch register.
|
||||
// Do not include return registers r3, f1, v2.
|
||||
def CSR_SVR32_ColdCC : CalleeSavedRegs<(add (sequence "R%u", 4, 10),
|
||||
(sequence "R%u", 14, 31),
|
||||
F0, (sequence "F%u", 2, 31),
|
||||
(sequence "CR%u", 0, 7))>;
|
||||
|
||||
def CSR_SVR32_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR32_ColdCC,
|
||||
(sequence "V%u", 0, 1),
|
||||
(sequence "V%u", 3, 31))>;
|
||||
|
||||
def CSR_SVR64_ColdCC : CalleeSavedRegs<(add (sequence "X%u", 4, 10),
|
||||
(sequence "X%u", 14, 31),
|
||||
F0, (sequence "F%u", 2, 31),
|
||||
(sequence "CR%u", 0, 7))>;
|
||||
|
||||
def CSR_SVR64_ColdCC_R2: CalleeSavedRegs<(add CSR_SVR64_ColdCC, X2)>;
|
||||
|
||||
def CSR_SVR64_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC,
|
||||
(sequence "V%u", 0, 1),
|
||||
(sequence "V%u", 3, 31))>;
|
||||
|
||||
def CSR_SVR64_ColdCC_R2_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC_Altivec, X2)>;
|
||||
|
||||
def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
|
||||
(sequence "X%u", 14, 31),
|
||||
(sequence "F%u", 0, 31),
|
||||
|
@ -206,6 +206,8 @@ CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
|
||||
return CC_PPC32_SVR4_ByVal;
|
||||
else if (Flag == 3)
|
||||
return CC_PPC32_SVR4_VarArg;
|
||||
else if (Flag == 4)
|
||||
return RetCC_PPC_Cold;
|
||||
else
|
||||
return RetCC_PPC;
|
||||
}
|
||||
|
@ -1950,7 +1950,14 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
|
||||
bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
|
||||
|
||||
// Add the callee-saved register as live-in; it's killed at the spill.
|
||||
MBB.addLiveIn(Reg);
|
||||
// Do not do this for callee-saved registers that are live-in to the
|
||||
// function because they will already be marked live-in and this will be
|
||||
// adding it for a second time. It is an error to add the same register
|
||||
// to the set more than once.
|
||||
const MachineRegisterInfo &MRI = MF->getRegInfo();
|
||||
bool IsLiveIn = MRI.isLiveIn(Reg);
|
||||
if (!IsLiveIn)
|
||||
MBB.addLiveIn(Reg);
|
||||
|
||||
if (CRSpilled && IsCRField) {
|
||||
CRMIB.addReg(Reg, RegState::ImplicitKill);
|
||||
@ -1980,7 +1987,10 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
|
||||
}
|
||||
} else {
|
||||
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
|
||||
TII.storeRegToStackSlot(MBB, MI, Reg, true,
|
||||
// Use !IsLiveIn for the kill flag.
|
||||
// We do not want to kill registers that are live in this function
|
||||
// before their use because they will become undefined registers.
|
||||
TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
|
||||
CSI[i].getFrameIdx(), RC, TRI);
|
||||
}
|
||||
}
|
||||
|
@ -4939,7 +4939,11 @@ SDValue PPCTargetLowering::LowerCallResult(
|
||||
SmallVector<CCValAssign, 16> RVLocs;
|
||||
CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
|
||||
*DAG.getContext());
|
||||
CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
|
||||
|
||||
CCRetInfo.AnalyzeCallResult(
|
||||
Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
|
||||
? RetCC_PPC_Cold
|
||||
: RetCC_PPC);
|
||||
|
||||
// Copy all of the result registers out of their specified physreg.
|
||||
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
|
||||
@ -5159,6 +5163,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
|
||||
// of the 32-bit SVR4 ABI stack frame layout.
|
||||
|
||||
assert((CallConv == CallingConv::C ||
|
||||
CallConv == CallingConv::Cold ||
|
||||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
|
||||
|
||||
unsigned PtrByteSize = 4;
|
||||
@ -6420,7 +6425,10 @@ PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
|
||||
LLVMContext &Context) const {
|
||||
SmallVector<CCValAssign, 16> RVLocs;
|
||||
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
|
||||
return CCInfo.CheckReturn(Outs, RetCC_PPC);
|
||||
return CCInfo.CheckReturn(
|
||||
Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
|
||||
? RetCC_PPC_Cold
|
||||
: RetCC_PPC);
|
||||
}
|
||||
|
||||
SDValue
|
||||
@ -6432,7 +6440,10 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
||||
SmallVector<CCValAssign, 16> RVLocs;
|
||||
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
|
||||
*DAG.getContext());
|
||||
CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
|
||||
CCInfo.AnalyzeReturn(Outs,
|
||||
(Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
|
||||
? RetCC_PPC_Cold
|
||||
: RetCC_PPC);
|
||||
|
||||
SDValue Flag;
|
||||
SmallVector<SDValue, 4> RetOps(1, Chain);
|
||||
|
@ -144,6 +144,17 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
|
||||
// On PPC64, we might need to save r2 (but only if it is not reserved).
|
||||
bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
|
||||
|
||||
if (MF->getFunction().getCallingConv() == CallingConv::Cold) {
|
||||
return TM.isPPC64()
|
||||
? (Subtarget.hasAltivec()
|
||||
? (SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList
|
||||
: CSR_SVR64_ColdCC_Altivec_SaveList)
|
||||
: (SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList
|
||||
: CSR_SVR64_ColdCC_SaveList))
|
||||
: (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_SaveList
|
||||
: CSR_SVR32_ColdCC_SaveList);
|
||||
}
|
||||
|
||||
return TM.isPPC64()
|
||||
? (Subtarget.hasAltivec()
|
||||
? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
|
||||
@ -196,6 +207,13 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
|
||||
: (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
|
||||
: CSR_Darwin32_RegMask);
|
||||
|
||||
if (CC == CallingConv::Cold) {
|
||||
return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask
|
||||
: CSR_SVR64_ColdCC_RegMask)
|
||||
: (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_RegMask
|
||||
: CSR_SVR32_ColdCC_RegMask);
|
||||
}
|
||||
|
||||
return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
|
||||
: CSR_SVR464_RegMask)
|
||||
: (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
|
||||
|
@ -27,6 +27,11 @@ static cl::opt<unsigned>
|
||||
CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
|
||||
cl::desc("The loop prefetch cache line size"));
|
||||
|
||||
static cl::opt<bool>
|
||||
EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
|
||||
cl::desc("Enable using coldcc calling conv for cold "
|
||||
"internal functions"));
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// PPC cost model.
|
||||
@ -215,6 +220,14 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
||||
BaseT::getUnrollingPreferences(L, SE, UP);
|
||||
}
|
||||
|
||||
// This function returns true to allow using coldcc calling convention.
|
||||
// Returning true results in coldcc being used for functions which are cold at
|
||||
// all call sites when the callers of the functions are not calling any other
|
||||
// non coldcc functions.
|
||||
bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
|
||||
return EnablePPCColdCC;
|
||||
}
|
||||
|
||||
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
|
||||
// On the A2, always unroll aggressively. For QPX unaligned loads, we depend
|
||||
// on combining the loads generated for consecutive accesses, and failure to
|
||||
|
@ -61,7 +61,7 @@ public:
|
||||
|
||||
/// \name Vector TTI Implementations
|
||||
/// @{
|
||||
|
||||
bool useColdCCForColdCall(Function &F);
|
||||
bool enableAggressiveInterleaving(bool LoopHasReductions);
|
||||
const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
|
||||
bool IsZeroCmp) const;
|
||||
|
@ -22,9 +22,11 @@
|
||||
#include "llvm/ADT/Statistic.h"
|
||||
#include "llvm/ADT/Twine.h"
|
||||
#include "llvm/ADT/iterator_range.h"
|
||||
#include "llvm/Analysis/BlockFrequencyInfo.h"
|
||||
#include "llvm/Analysis/ConstantFolding.h"
|
||||
#include "llvm/Analysis/MemoryBuiltins.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/BinaryFormat/Dwarf.h"
|
||||
#include "llvm/IR/Attributes.h"
|
||||
#include "llvm/IR/BasicBlock.h"
|
||||
@ -55,6 +57,7 @@
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/AtomicOrdering.h"
|
||||
#include "llvm/Support/Casting.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
@ -88,6 +91,21 @@ STATISTIC(NumNestRemoved , "Number of nest attributes removed");
|
||||
STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
|
||||
STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
|
||||
STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
|
||||
STATISTIC(NumInternalFunc, "Number of internal functions");
|
||||
STATISTIC(NumColdCC, "Number of functions marked coldcc");
|
||||
|
||||
static cl::opt<bool>
|
||||
EnableColdCCStressTest("enable-coldcc-stress-test",
|
||||
cl::desc("Enable stress test of coldcc by adding "
|
||||
"calling conv to all internal functions."),
|
||||
cl::init(false), cl::Hidden);
|
||||
|
||||
// Hidden tuning knob consumed by isColdCallSite(): the percentage of the
// caller's entry-block frequency below which a call site counts as cold.
// Defaults to 2 (percent).
static cl::opt<int> ColdCCRelFreq(
    "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
    cl::desc(
        "Maximum block frequency, expressed as a percentage of caller's "
        // The trailing space matters: adjacent literals are concatenated, and
        // without it the help text would read "enablingcoldcc".
        "entry frequency, for a call site to be considered cold for enabling "
        "coldcc"));
|
||||
|
||||
/// Is this global variable possibly used by a leak checker as a root? If so,
|
||||
/// we might not really want to eliminate the stores to it.
|
||||
@ -2095,20 +2113,114 @@ static void RemoveNestAttribute(Function *F) {
|
||||
/// idea here is that we don't want to mess with the convention if the user
|
||||
/// explicitly requested something with performance implications like coldcc,
|
||||
/// GHC, or anyregcc.
|
||||
static bool isProfitableToMakeFastCC(Function *F) {
|
||||
static bool hasChangeableCC(Function *F) {
|
||||
CallingConv::ID CC = F->getCallingConv();
|
||||
// FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
|
||||
return CC == CallingConv::C || CC == CallingConv::X86_ThisCall;
|
||||
}
|
||||
|
||||
/// Return true if the block containing the call site has a BlockFrequency of
|
||||
/// less than ColdCCRelFreq% of the entry block.
|
||||
static bool isColdCallSite(CallSite CS, BlockFrequencyInfo &CallerBFI) {
|
||||
const BranchProbability ColdProb(ColdCCRelFreq, 100);
|
||||
auto CallSiteBB = CS.getInstruction()->getParent();
|
||||
auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
|
||||
auto CallerEntryFreq =
|
||||
CallerBFI.getBlockFreq(&(CS.getCaller()->getEntryBlock()));
|
||||
return CallSiteFreq < CallerEntryFreq * ColdProb;
|
||||
}
|
||||
|
||||
// This function checks if the input function F is cold at all call sites. It
|
||||
// also looks each call site's containing function, returning false if the
|
||||
// caller function contains other non cold calls. The input vector AllCallsCold
|
||||
// contains a list of functions that only have call sites in cold blocks.
|
||||
static bool
|
||||
isValidCandidateForColdCC(Function &F,
|
||||
function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
|
||||
const std::vector<Function *> &AllCallsCold) {
|
||||
|
||||
if (F.user_empty())
|
||||
return false;
|
||||
|
||||
for (User *U : F.users()) {
|
||||
if (isa<BlockAddress>(U))
|
||||
continue;
|
||||
|
||||
CallSite CS(cast<Instruction>(U));
|
||||
Function *CallerFunc = CS.getInstruction()->getParent()->getParent();
|
||||
BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
|
||||
if (!isColdCallSite(CS, CallerBFI))
|
||||
return false;
|
||||
auto It = std::find(AllCallsCold.begin(), AllCallsCold.end(), CallerFunc);
|
||||
if (It == AllCallsCold.end())
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Set the calling convention of every call site of \p F to coldcc.
/// BlockAddress users are skipped; every other user is cast to an Instruction
/// and treated as a call site.
static void changeCallSitesToColdCC(Function *F) {
  for (User *FnUser : F->users()) {
    if (!isa<BlockAddress>(FnUser)) {
      CallSite Site(cast<Instruction>(FnUser));
      Site.setCallingConv(CallingConv::Cold);
    }
  }
}
|
||||
|
||||
// This function iterates over all the call instructions in the input Function
|
||||
// and checks that all call sites are in cold blocks and are allowed to use the
|
||||
// coldcc calling convention.
|
||||
static bool
|
||||
hasOnlyColdCalls(Function &F,
|
||||
function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
|
||||
for (BasicBlock &BB : F) {
|
||||
for (Instruction &I : BB) {
|
||||
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
|
||||
CallSite CS(cast<Instruction>(CI));
|
||||
// Skip over isline asm instructions since they aren't function calls.
|
||||
if (CI->isInlineAsm())
|
||||
continue;
|
||||
Function *CalledFn = CI->getCalledFunction();
|
||||
if (!CalledFn)
|
||||
return false;
|
||||
if (!CalledFn->hasLocalLinkage())
|
||||
return false;
|
||||
// Skip over instrinsics since they won't remain as function calls.
|
||||
if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
|
||||
continue;
|
||||
// Check if it's valid to use coldcc calling convention.
|
||||
if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
|
||||
CalledFn->hasAddressTaken())
|
||||
return false;
|
||||
BlockFrequencyInfo &CallerBFI = GetBFI(F);
|
||||
if (!isColdCallSite(CS, CallerBFI))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
|
||||
function_ref<TargetTransformInfo &(Function &)> GetTTI,
|
||||
function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
|
||||
function_ref<DominatorTree &(Function &)> LookupDomTree,
|
||||
SmallSet<const Comdat *, 8> &NotDiscardableComdats) {
|
||||
|
||||
bool Changed = false;
|
||||
|
||||
std::vector<Function *> AllCallsCold;
|
||||
for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
|
||||
Function *F = &*FI++;
|
||||
if (hasOnlyColdCalls(*F, GetBFI))
|
||||
AllCallsCold.push_back(F);
|
||||
}
|
||||
|
||||
// Optimize functions.
|
||||
for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
|
||||
Function *F = &*FI++;
|
||||
|
||||
// Functions without names cannot be referenced outside this module.
|
||||
if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
|
||||
F->setLinkage(GlobalValue::InternalLinkage);
|
||||
@ -2140,7 +2252,25 @@ OptimizeFunctions(Module &M, TargetLibraryInfo *TLI,
|
||||
|
||||
if (!F->hasLocalLinkage())
|
||||
continue;
|
||||
if (isProfitableToMakeFastCC(F) && !F->isVarArg() &&
|
||||
|
||||
if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
|
||||
NumInternalFunc++;
|
||||
TargetTransformInfo &TTI = GetTTI(*F);
|
||||
// Change the calling convention to coldcc if either stress testing is
|
||||
// enabled or the target would like to use coldcc on functions which are
|
||||
// cold at all call sites and the callers contain no other non coldcc
|
||||
// calls.
|
||||
if (EnableColdCCStressTest ||
|
||||
(isValidCandidateForColdCC(*F, GetBFI, AllCallsCold) &&
|
||||
TTI.useColdCCForColdCall(*F))) {
|
||||
F->setCallingConv(CallingConv::Cold);
|
||||
changeCallSitesToColdCC(F);
|
||||
Changed = true;
|
||||
NumColdCC++;
|
||||
}
|
||||
}
|
||||
|
||||
if (hasChangeableCC(F) && !F->isVarArg() &&
|
||||
!F->hasAddressTaken()) {
|
||||
// If this function has a calling convention worth changing, is not a
|
||||
// varargs function, and is only called directly, promote it to use the
|
||||
@ -2618,6 +2748,8 @@ static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
|
||||
|
||||
static bool optimizeGlobalsInModule(
|
||||
Module &M, const DataLayout &DL, TargetLibraryInfo *TLI,
|
||||
function_ref<TargetTransformInfo &(Function &)> GetTTI,
|
||||
function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
|
||||
function_ref<DominatorTree &(Function &)> LookupDomTree) {
|
||||
SmallSet<const Comdat *, 8> NotDiscardableComdats;
|
||||
bool Changed = false;
|
||||
@ -2640,8 +2772,8 @@ static bool optimizeGlobalsInModule(
|
||||
NotDiscardableComdats.insert(C);
|
||||
|
||||
// Delete functions that are trivially dead, ccc -> fastcc
|
||||
LocalChange |=
|
||||
OptimizeFunctions(M, TLI, LookupDomTree, NotDiscardableComdats);
|
||||
LocalChange |= OptimizeFunctions(M, TLI, GetTTI, GetBFI, LookupDomTree,
|
||||
NotDiscardableComdats);
|
||||
|
||||
// Optimize global_ctors list.
|
||||
LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
|
||||
@ -2678,7 +2810,15 @@ PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
|
||||
auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
|
||||
return FAM.getResult<DominatorTreeAnalysis>(F);
|
||||
};
|
||||
if (!optimizeGlobalsInModule(M, DL, &TLI, LookupDomTree))
|
||||
auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
|
||||
return FAM.getResult<TargetIRAnalysis>(F);
|
||||
};
|
||||
|
||||
auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
|
||||
return FAM.getResult<BlockFrequencyAnalysis>(F);
|
||||
};
|
||||
|
||||
if (!optimizeGlobalsInModule(M, DL, &TLI, GetTTI, GetBFI, LookupDomTree))
|
||||
return PreservedAnalyses::all();
|
||||
return PreservedAnalyses::none();
|
||||
}
|
||||
@ -2701,12 +2841,22 @@ struct GlobalOptLegacyPass : public ModulePass {
|
||||
auto LookupDomTree = [this](Function &F) -> DominatorTree & {
|
||||
return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
|
||||
};
|
||||
return optimizeGlobalsInModule(M, DL, TLI, LookupDomTree);
|
||||
auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
|
||||
return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
||||
};
|
||||
|
||||
auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
|
||||
return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
|
||||
};
|
||||
|
||||
return optimizeGlobalsInModule(M, DL, TLI, GetTTI, GetBFI, LookupDomTree);
|
||||
}
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<TargetLibraryInfoWrapperPass>();
|
||||
AU.addRequired<TargetTransformInfoWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addRequired<BlockFrequencyInfoWrapperPass>();
|
||||
}
|
||||
};
|
||||
|
||||
@ -2717,6 +2867,8 @@ char GlobalOptLegacyPass::ID = 0;
|
||||
INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
|
||||
"Global Variable Optimizer", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
||||
INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
|
||||
"Global Variable Optimizer", false, false)
|
||||
|
46
test/CodeGen/PowerPC/coldcc.ll
Normal file
46
test/CodeGen/PowerPC/coldcc.ll
Normal file
@ -0,0 +1,46 @@
|
||||
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefix=COLDCC
|
||||
|
||||
define signext i32 @caller(i32 signext %a, i32 signext %b, i32 signext %cold) {
|
||||
entry:
|
||||
%0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r14},~{r15},~{r16},~{r17},~{r18},~{r19},~{r20},~{r21},~{r22},~{r23},~{r24},~{r25},~{r26},~{r27},~{r28},~{r29},~{r30},~{r31}"(i32 %a, i32 %b)
|
||||
%mul = mul nsw i32 %0, %cold
|
||||
%tobool = icmp eq i32 %cold, 0
|
||||
br i1 %tobool, label %if.end, label %if.then
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%mul1 = mul nsw i32 %mul, %cold
|
||||
%mul2 = mul nsw i32 %b, %a
|
||||
%call = tail call coldcc signext i32 @callee(i32 signext %a, i32 signext %b)
|
||||
%add = add i32 %mul2, %a
|
||||
%add3 = add i32 %add, %mul
|
||||
%add4 = add i32 %add3, %mul1
|
||||
%add5 = add i32 %add4, %call
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %entry, %if.then
|
||||
%f.0 = phi i32 [ %add5, %if.then ], [ %0, %entry ]
|
||||
ret i32 %f.0
|
||||
}
|
||||
|
||||
define internal coldcc signext i32 @callee(i32 signext %a, i32 signext %b) local_unnamed_addr #0 {
|
||||
entry:
|
||||
; COLDCC: @callee
|
||||
; COLDCC: std 6, -8(1)
|
||||
; COLDCC: std 7, -16(1)
|
||||
; COLDCC: std 8, -24(1)
|
||||
; COLDCC: std 9, -32(1)
|
||||
; COLDCC: std 10, -40(1)
|
||||
; COLDCC: ld 9, -32(1)
|
||||
; COLDCC: ld 8, -24(1)
|
||||
; COLDCC: ld 7, -16(1)
|
||||
; COLDCC: ld 10, -40(1)
|
||||
; COLDCC: ld 6, -8(1)
|
||||
%0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r6},~{r7},~{r8},~{r9},~{r10}"(i32 %a, i32 %b)
|
||||
%mul = mul nsw i32 %a, 3
|
||||
%1 = mul i32 %b, -5
|
||||
%add = add i32 %1, %mul
|
||||
%sub = add i32 %add, %0
|
||||
ret i32 %sub
|
||||
}
|
||||
|
||||
attributes #0 = { noinline }
|
42
test/CodeGen/PowerPC/coldcc2.ll
Normal file
42
test/CodeGen/PowerPC/coldcc2.ll
Normal file
@ -0,0 +1,42 @@
|
||||
; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefix=COLDCC
|
||||
|
||||
%struct.MyStruct = type { i32, i32, i32, i32 }
|
||||
|
||||
@caller.s = internal unnamed_addr global %struct.MyStruct zeroinitializer, align 8
|
||||
|
||||
define signext i32 @caller(i32 signext %a, i32 signext %b, i32 signext %cold) {
|
||||
entry:
|
||||
; COLDCC: bl callee
|
||||
; COLDCC: ld 4, 40(1)
|
||||
; COLDCC: ld 5, 32(1)
|
||||
%call = tail call coldcc { i64, i64 } @callee(i32 signext %a, i32 signext %b)
|
||||
%0 = extractvalue { i64, i64 } %call, 0
|
||||
%1 = extractvalue { i64, i64 } %call, 1
|
||||
store i64 %0, i64* bitcast (%struct.MyStruct* @caller.s to i64*), align 8
|
||||
store i64 %1, i64* bitcast (i32* getelementptr inbounds (%struct.MyStruct, %struct.MyStruct* @caller.s, i64 0, i32 2) to i64*), align 8
|
||||
%2 = lshr i64 %1, 32
|
||||
%3 = trunc i64 %2 to i32
|
||||
%sub = sub nsw i32 0, %3
|
||||
ret i32 %sub
|
||||
}
|
||||
|
||||
define internal coldcc { i64, i64 } @callee(i32 signext %a, i32 signext %b) {
|
||||
entry:
|
||||
; COLDCC: std {{[0-9]+}}, 0(3)
|
||||
; COLDCC: std {{[0-9]+}}, 8(3)
|
||||
%0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r6},~{r7},~{r8},~{r9},~{r10}"(i32 %a, i32 %b)
|
||||
%mul = mul nsw i32 %a, 3
|
||||
%1 = mul i32 %b, -5
|
||||
%add = add i32 %1, %mul
|
||||
%sub = add i32 %add, %0
|
||||
%mul5 = mul nsw i32 %b, %a
|
||||
%add6 = add nsw i32 %sub, %mul5
|
||||
%retval.sroa.0.0.insert.ext = zext i32 %0 to i64
|
||||
%retval.sroa.3.8.insert.ext = zext i32 %sub to i64
|
||||
%retval.sroa.3.12.insert.ext = zext i32 %add6 to i64
|
||||
%retval.sroa.3.12.insert.shift = shl nuw i64 %retval.sroa.3.12.insert.ext, 32
|
||||
%retval.sroa.3.12.insert.insert = or i64 %retval.sroa.3.12.insert.shift, %retval.sroa.3.8.insert.ext
|
||||
%.fca.0.insert = insertvalue { i64, i64 } undef, i64 %retval.sroa.0.0.insert.ext, 0
|
||||
%.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %retval.sroa.3.12.insert.insert, 1
|
||||
ret { i64, i64 } %.fca.1.insert
|
||||
}
|
@ -93,7 +93,7 @@
|
||||
; FIXME: There really shouldn't be another pass manager, especially one that
|
||||
; just builds the domtree. It doesn't even run the verifier.
|
||||
; CHECK-O2: Pass Arguments:
|
||||
; CHECK-O2-NEXT: FunctionPass Manager
|
||||
; CHECK-O2: FunctionPass Manager
|
||||
; CHECK-O2-NEXT: Dominator Tree Construction
|
||||
|
||||
define void @foo() {
|
||||
|
81
test/Transforms/GlobalOpt/PowerPC/coldcc_coldsites.ll
Normal file
81
test/Transforms/GlobalOpt/PowerPC/coldcc_coldsites.ll
Normal file
@ -0,0 +1,81 @@
|
||||
; RUN: opt -globalopt -mtriple=powerpc64le-unknown-linux-gnu -ppc-enable-coldcc -S < %s | FileCheck %s -check-prefix=COLDCC
|
||||
; RUN: opt -globalopt -S < %s | FileCheck %s -check-prefix=CHECK
|
||||
|
||||
define signext i32 @caller(i32 signext %a, i32 signext %b, i32 signext %lim, i32 signext %i) local_unnamed_addr #0 !prof !30 {
|
||||
entry:
|
||||
; COLDCC: call coldcc signext i32 @callee
|
||||
; CHECK: call fastcc signext i32 @callee
|
||||
%add = add nsw i32 %b, %a
|
||||
%sub = add nsw i32 %lim, -1
|
||||
%cmp = icmp eq i32 %sub, %i
|
||||
br i1 %cmp, label %if.then, label %if.end, !prof !31
|
||||
|
||||
if.then: ; preds = %entry
|
||||
%call = tail call signext i32 @callee(i32 signext %a, i32 signext %b)
|
||||
br label %if.end
|
||||
|
||||
if.end: ; preds = %if.then, %entry
|
||||
%f.0 = phi i32 [ %call, %if.then ], [ %add, %entry ]
|
||||
ret i32 %f.0
|
||||
}
|
||||
|
||||
define internal signext i32 @callee(i32 signext %a, i32 signext %b) unnamed_addr #0 {
|
||||
entry:
|
||||
%0 = tail call i32 asm "add $0, $1, $2", "=r,r,r,~{r6},~{r7},~{r8},~{r9}"(i32 %a, i32 %b) #1, !srcloc !32
|
||||
%mul = mul nsw i32 %a, 3
|
||||
%mul1 = shl i32 %0, 1
|
||||
%add = add nsw i32 %mul1, %mul
|
||||
ret i32 %add
|
||||
}
|
||||
|
||||
define signext i32 @main() local_unnamed_addr #0 !prof !33 {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup: ; preds = %for.body
|
||||
%add.lcssa = phi i32 [ %add, %for.body ]
|
||||
ret i32 %add.lcssa
|
||||
|
||||
for.body: ; preds = %for.body, %entry
|
||||
%i.011 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
|
||||
%ret.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
|
||||
%call = tail call signext i32 @caller(i32 signext 4, i32 signext 5, i32 signext 10000000, i32 signext %i.011)
|
||||
%add = add nsw i32 %call, %ret.010
|
||||
%inc = add nuw nsw i32 %i.011, 1
|
||||
%exitcond = icmp eq i32 %inc, 10000000
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !34
|
||||
}
|
||||
attributes #0 = { noinline }
|
||||
|
||||
!0 = !{i32 1, !"ProfileSummary", !1}
|
||||
!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
|
||||
!2 = !{!"ProfileFormat", !"InstrProf"}
|
||||
!3 = !{!"TotalCount", i64 20000003}
|
||||
!4 = !{!"MaxCount", i64 10000000}
|
||||
!5 = !{!"MaxInternalCount", i64 10000000}
|
||||
!6 = !{!"MaxFunctionCount", i64 10000000}
|
||||
!7 = !{!"NumCounts", i64 5}
|
||||
!8 = !{!"NumFunctions", i64 3}
|
||||
!9 = !{!"DetailedSummary", !10}
|
||||
!10 = !{!11, !12, !13, !14, !15, !16, !16, !17, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26}
|
||||
!11 = !{i32 10000, i64 10000000, i32 2}
|
||||
!12 = !{i32 100000, i64 10000000, i32 2}
|
||||
!13 = !{i32 200000, i64 10000000, i32 2}
|
||||
!14 = !{i32 300000, i64 10000000, i32 2}
|
||||
!15 = !{i32 400000, i64 10000000, i32 2}
|
||||
!16 = !{i32 500000, i64 10000000, i32 2}
|
||||
!17 = !{i32 600000, i64 10000000, i32 2}
|
||||
!18 = !{i32 700000, i64 10000000, i32 2}
|
||||
!19 = !{i32 800000, i64 10000000, i32 2}
|
||||
!20 = !{i32 900000, i64 10000000, i32 2}
|
||||
!21 = !{i32 950000, i64 10000000, i32 2}
|
||||
!22 = !{i32 990000, i64 10000000, i32 2}
|
||||
!23 = !{i32 999000, i64 10000000, i32 2}
|
||||
!24 = !{i32 999900, i64 10000000, i32 2}
|
||||
!25 = !{i32 999990, i64 10000000, i32 2}
|
||||
!26 = !{i32 999999, i64 10000000, i32 2}
|
||||
!30 = !{!"function_entry_count", i64 10000000}
|
||||
!31 = !{!"branch_weights", i32 2, i32 10000000}
|
||||
!32 = !{i32 59}
|
||||
!33 = !{!"function_entry_count", i64 1}
|
||||
!34 = !{!"branch_weights", i32 2, i32 10000001}
|
3
test/Transforms/GlobalOpt/PowerPC/lit.local.cfg
Normal file
3
test/Transforms/GlobalOpt/PowerPC/lit.local.cfg
Normal file
@ -0,0 +1,3 @@
|
||||
if not 'PowerPC' in config.root.targets:
|
||||
config.unsupported = True
|
||||
|
48
test/Transforms/GlobalOpt/coldcc_stress_test.ll
Normal file
48
test/Transforms/GlobalOpt/coldcc_stress_test.ll
Normal file
@ -0,0 +1,48 @@
|
||||
; RUN: opt < %s -globalopt -S -enable-coldcc-stress-test -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s -check-prefix=COLDCC
|
||||
; RUN: opt < %s -globalopt -S | FileCheck %s -check-prefix=CHECK
|
||||
|
||||
define internal i32 @callee_default(i32* %m) {
|
||||
; COLDCC-LABEL: define internal coldcc i32 @callee_default
|
||||
; CHECK-LABEL: define internal fastcc i32 @callee_default
|
||||
%v = load i32, i32* %m
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define internal fastcc i32 @callee_fastcc(i32* %m) {
|
||||
; COLDCC-LABEL: define internal fastcc i32 @callee_fastcc
|
||||
; CHECK-LABEL: define internal fastcc i32 @callee_fastcc
|
||||
%v = load i32, i32* %m
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define internal coldcc i32 @callee_coldcc(i32* %m) {
|
||||
; COLDCC-LABEL: define internal coldcc i32 @callee_coldcc
|
||||
; CHECK-LABEL: define internal coldcc i32 @callee_coldcc
|
||||
%v = load i32, i32* %m
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define i32 @callee(i32* %m) {
|
||||
%v = load i32, i32* %m
|
||||
ret i32 %v
|
||||
}
|
||||
|
||||
define void @caller() {
|
||||
%m = alloca i32
|
||||
call i32 @callee_default(i32* %m)
|
||||
call fastcc i32 @callee_fastcc(i32* %m)
|
||||
call coldcc i32 @callee_coldcc(i32* %m)
|
||||
call i32 @callee(i32* %m)
|
||||
ret void
|
||||
}
|
||||
|
||||
; COLDCC-LABEL: define void @caller()
|
||||
; COLDCC: call coldcc i32 @callee_default
|
||||
; COLDCC: call fastcc i32 @callee_fastcc
|
||||
; COLDCC: call coldcc i32 @callee_coldcc
|
||||
; COLDCC: call i32 @callee
|
||||
; CHECK-LABEL: define void @caller()
|
||||
; CHECK: call fastcc i32 @callee_default
|
||||
; CHECK: call fastcc i32 @callee_fastcc
|
||||
; CHECK: call coldcc i32 @callee_coldcc
|
||||
; CHECK: call i32 @callee
|
Loading…
Reference in New Issue
Block a user