diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 2d8838c520b..197b8d1eb47 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -792,6 +792,12 @@ public:
     return false;
   }

+  /// getMaximalGlobalOffset - Returns the maximal possible offset which can be
+  /// used for loads / stores from the global.
+  virtual unsigned getMaximalGlobalOffset() const {
+    return 0;
+  }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Optimization Methods
   //
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 14825a78564..233706735e3 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -98,6 +98,7 @@ FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,

 FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
 FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
 FunctionPass *createARMConstantIslandPass();
 FunctionPass *createNEONPreAllocPass();
 FunctionPass *createNEONMoveFixPass();
diff --git a/lib/Target/ARM/ARMGlobalMerge.cpp b/lib/Target/ARM/ARMGlobalMerge.cpp
new file mode 100644
index 00000000000..3cd6519bbfd
--- /dev/null
+++ b/lib/Target/ARM/ARMGlobalMerge.cpp
@@ -0,0 +1,222 @@
+//===-- ARMGlobalMerge.cpp - Internal globals merging --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass merges globals with internal linkage into one. This way all the
+// globals which were merged into a bigger one can be addressed using offsets
+// from the same base pointer (no need for a separate base pointer for each
+// global). Such a transformation can significantly reduce the register pressure
+// when many globals are involved.
+//
+// For example, consider code which touches several global variables at once:
+//
+// static int foo[N], bar[N], baz[N];
+//
+// for (i = 0; i < N; ++i) {
+//    foo[i] = bar[i] * baz[i];
+// }
+//
+//  On ARM the addresses of 3 arrays should be kept in the registers, thus
+//  this code has quite large register pressure (loop body):
+//
+//  ldr     r1, [r5], #4
+//  ldr     r2, [r6], #4
+//  mul     r1, r2, r1
+//  str     r1, [r0], #4
+//
+//  Pass converts the code to something like:
+//
+//  static struct {
+//    int foo[N];
+//    int bar[N];
+//    int baz[N];
+//  } merged;
+//
+//  for (i = 0; i < N; ++i) {
+//    merged.foo[i] = merged.bar[i] * merged.baz[i];
+//  }
+//
+//  and in ARM code this becomes:
+//
+//  ldr     r0, [r5, #40]
+//  ldr     r1, [r5, #80]
+//  mul     r0, r1, r0
+//  str     r0, [r5], #4
+//
+//  note that we saved 2 registers here almost "for free".
+// ===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-global-merge"
+#include "ARM.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Attributes.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include <algorithm>
+#include <vector>
+using namespace llvm;
+
+namespace {
+  /// ARMGlobalMerge - Merges eligible internal globals of a module into
+  /// aggregate globals. All of the work is done once per module in
+  /// doInitialization(); runOnFunction() never modifies anything.
+  class LLVM_LIBRARY_VISIBILITY ARMGlobalMerge : public FunctionPass {
+    /// TLI - Keep a pointer of a TargetLowering to consult for determining
+    /// target type sizes.
+    const TargetLowering *TLI;
+
+    /// doMerge - Merge the given globals into one or more internal struct
+    /// globals and redirect all uses of the originals to the corresponding
+    /// struct fields. isConst is used as the constness of every merged global.
+    bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+                 Module &M, bool isConst) const;
+
+  public:
+    static char ID;             // Pass identification, replacement for typeid.
+    explicit ARMGlobalMerge(const TargetLowering *tli)
+      : FunctionPass(&ID), TLI(tli) {}
+
+    virtual bool doInitialization(Module &M);
+    virtual bool runOnFunction(Function& F);
+
+    const char *getPassName() const {
+      return "Merge internal globals";
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      FunctionPass::getAnalysisUsage(AU);
+    }
+
+    /// GlobalCmp - Orders globals by increasing allocation size of the
+    /// pointed-to type.
+    struct GlobalCmp {
+      const TargetData *TD;
+
+      GlobalCmp(const TargetData *td) : TD(td) {}
+
+      bool operator()(const GlobalVariable *GV1,
+                      const GlobalVariable *GV2) const {
+        const Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
+        const Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
+
+        return TD->getTypeAllocSize(Ty1) < TD->getTypeAllocSize(Ty2);
+      }
+    };
+  };
+} // end anonymous namespace
+
+char ARMGlobalMerge::ID = 0;
+
+bool ARMGlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+                             Module &M, bool isConst) const {
+  const TargetData *TD = TLI->getTargetData();
+
+  // FIXME: Infer the maximum possible offset depending on the actual users
+  // (these max offsets are different for the users inside Thumb or ARM
+  // functions)
+  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
+
+  // With a zero limit the grouping loop below could never make progress.
+  if (MaxOffset == 0)
+    return false;
+
+  // FIXME: Find better heuristics
+  std::stable_sort(Globals.begin(), Globals.end(), GlobalCmp(TD));
+
+  const Type *Int32Ty = Type::getInt32Ty(M.getContext());
+
+  for (size_t i = 0, e = Globals.size(); i != e; ) {
+    size_t j = 0;
+    uint64_t MergedSize = 0;
+    std::vector<const Type*> Tys;
+    std::vector<Constant*> Inits;
+    // Greedily take globals until the accumulated allocation size reaches
+    // MaxOffset. The size is tested before each addition, so the last global
+    // taken may push the total past the limit.
+    for (j = i; MergedSize < MaxOffset && j != e; ++j) {
+      const Type *Ty = Globals[j]->getType()->getElementType();
+      Tys.push_back(Ty);
+      Inits.push_back(Globals[j]->getInitializer());
+      MergedSize += TD->getTypeAllocSize(Ty);
+    }
+
+    StructType *MergedTy = StructType::get(M.getContext(), Tys);
+    Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
+    GlobalVariable *MergedGV = new GlobalVariable(M, MergedTy, isConst,
+                                                  GlobalValue::InternalLinkage,
+                                                  MergedInit, "merged");
+    for (size_t k = i; k < j; ++k) {
+      // Build a constant GEP to field (k - i) of the merged struct.
+      SmallVector<Constant*, 2> Idx;
+      Idx.push_back(ConstantInt::get(Int32Ty, 0));
+      Idx.push_back(ConstantInt::get(Int32Ty, k-i));
+
+      Constant *GEP =
+        ConstantExpr::getInBoundsGetElementPtr(MergedGV,
+                                               &Idx[0], Idx.size());
+
+      Globals[k]->replaceAllUsesWith(GEP);
+      Globals[k]->eraseFromParent();
+    }
+    i = j;
+  }
+
+  return true;
+}
+
+bool ARMGlobalMerge::doInitialization(Module& M) {
+  SmallVector<GlobalVariable*, 16> Globals, ConstGlobals;
+  const TargetData *TD = TLI->getTargetData();
+  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
+  bool Changed = false;
+
+  // Collect the mergeable globals, keeping constants and non-constants apart.
+  for (Module::global_iterator I = M.global_begin(),
+         E = M.global_end(); I != E; ++I) {
+    // Merge is safe for "normal" internal globals only
+    if (!I->hasLocalLinkage() || I->isThreadLocal() || I->hasSection())
+      continue;
+
+    // Ignore fancy-aligned globals for now.
+    if (I->getAlignment() != 0)
+      continue;
+
+    // I->getType() is the pointer type of the global; the size that matters
+    // is that of the pointed-to value.
+    if (TD->getTypeAllocSize(I->getType()->getElementType()) < MaxOffset) {
+      if (I->isConstant())
+        ConstGlobals.push_back(I);
+      else
+        Globals.push_back(I);
+    }
+  }
+
+  if (Globals.size() > 1)
+    Changed |= doMerge(Globals, M, false);
+  if (ConstGlobals.size() > 1)
+    Changed |= doMerge(ConstGlobals, M, true);
+
+  return Changed;
+}
+
+bool ARMGlobalMerge::runOnFunction(Function& F) {
+  return false;
+}
+
+FunctionPass *llvm::createARMGlobalMergePass(const TargetLowering *tli) {
+  return new ARMGlobalMerge(tli);
+}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 35ca389bf33..56d24400f27 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -703,6 +703,12 @@ unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
   return getTargetMachine().getSubtarget<ARMSubtarget>().isThumb() ? 1 : 2;
 }

+/// getMaximalGlobalOffset - Returns the maximal possible offset which can
+/// be used for loads / stores from the global.
+unsigned ARMTargetLowering::getMaximalGlobalOffset() const {
+  return (Subtarget->isThumb1Only() ? 127 : 4095);
+}
+
 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
   unsigned NumVals = N->getNumValues();
   if (!NumVals)
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index b544b5eee2b..3fae7b73fd0 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -263,6 +263,10 @@ namespace llvm {
     /// getFunctionAlignment - Return the Log2 alignment of this function.
     virtual unsigned getFunctionAlignment(const Function *F) const;

+    /// getMaximalGlobalOffset - Returns the maximal possible offset which can
+    /// be used for loads / stores from the global.
+    virtual unsigned getMaximalGlobalOffset() const;
+
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
     virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo) const;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 09203f9304d..9d7bf75cf17 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -85,9 +85,15 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
     TSInfo(*this) {
 }

-
-
 // Pass Pipeline Configuration
+bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM,
+                                      CodeGenOpt::Level OptLevel) {
+  if (OptLevel != CodeGenOpt::None)
+    PM.add(createARMGlobalMergePass(getTargetLowering()));
+
+  return false;
+}
+
 bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM,
                                            CodeGenOpt::Level OptLevel) {
   PM.add(createARMISelDag(*this, OptLevel));
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index a222e57b13f..17e5425a9d3 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -50,6 +50,7 @@ public:
   }

   // Pass Pipeline Configuration
+  virtual bool addPreISel(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
   virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
   virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
   virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel);