diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 4dbfa2619be..0d9ba26aa14 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -99,6 +99,7 @@ namespace {
       (void) llvm::createSCCPPass();
       (void) llvm::createScalarReplAggregatesPass();
       (void) llvm::createSimplifyLibCallsPass();
+      (void) llvm::createSimplifyHalfPowrLibCallsPass();
       (void) llvm::createSingleLoopExtractorPass();
       (void) llvm::createStripSymbolsPass();
       (void) llvm::createStripDeadPrototypesPass();
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 52f7967af2e..2c3fdd4a788 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -317,6 +317,12 @@ Pass *createLoopDeletionPass();
 /// specific well-known (library) functions.
 FunctionPass *createSimplifyLibCallsPass();
 
+//===----------------------------------------------------------------------===//
+//
+/// createSimplifyHalfPowrLibCallsPass - This is an experimental pass that
+/// optimizes calls to specific half_powr functions.
+FunctionPass *createSimplifyHalfPowrLibCallsPass();
+
 //===----------------------------------------------------------------------===//
 //
 // CodeGenPrepare - This pass prepares a function for instruction selection.
diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
new file mode 100644
index 00000000000..530ad038cbe
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp
@@ -0,0 +1,182 @@
+//===- SimplifyHalfPowrLibCalls.cpp - Optimize specific half_powr calls ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple pass that applies an experimental
+// transformation on calls to specific functions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplify-libcalls-halfpowr"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Config/config.h"
+#include <cassert>
+#include <cstring>
+#include <vector>
+using namespace llvm;
+
+namespace {
+  /// This pass inlines short runs of adjacent calls to the half_powr library
+  /// function and merges their bodies into a single block.
+  ///
+  class VISIBILITY_HIDDEN SimplifyHalfPowrLibCalls : public FunctionPass {
+    const TargetData *TD; // Set at the start of each runOnFunction call.
+  public:
+    static char ID; // Pass identification
+    SimplifyHalfPowrLibCalls() : FunctionPass(&ID) {}
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+    }
+
+    Instruction *
+    InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs,
+                    Instruction *InsertPt);
+  };
+  char SimplifyHalfPowrLibCalls::ID = 0;
+} // end anonymous namespace.
+
+static RegisterPass<SimplifyHalfPowrLibCalls>
+X("simplify-libcalls-halfpowr", "Simplify half_powr library calls");
+
+// Public interface to the Simplify HalfPowr LibCalls pass.
+FunctionPass *llvm::createSimplifyHalfPowrLibCallsPass() {
+  return new SimplifyHalfPowrLibCalls();
+}
+
+/// InlineHalfPowrs - Inline a sequence of adjacent half_powr calls, rearranging
+/// their control flow to better facilitate subsequent optimization.
+///
+/// HalfPowrs holds the call instructions to inline, in program order.  Calls
+/// are processed in order until one fails the sanity checks below; that call
+/// and any after it are left untouched.  Returns InsertPt if nothing was
+/// inlined, otherwise the first instruction of the block that now holds the
+/// merged bodies.
+Instruction *
+SimplifyHalfPowrLibCalls::InlineHalfPowrs(
+                                 const std::vector<Instruction *> &HalfPowrs,
+                                 Instruction *InsertPt) {
+  std::vector<BasicBlock *> Bodies;
+  BasicBlock *NewBlock = 0;
+
+  for (unsigned i = 0, e = HalfPowrs.size(); i != e; ++i) {
+    CallInst *Call = cast<CallInst>(HalfPowrs[i]);
+    Function *Callee = Call->getCalledFunction();
+
+    // A callee with external linkage may be a mere declaration.  Without a
+    // body there is nothing to inspect or inline, and stepping the block
+    // iterator below would run off the end of an empty list.
+    if (Callee->empty()) break;
+
+    // Minimally sanity-check the CFG of half_powr to ensure that it contains
+    // the kind of code we expect.  If we're running this pass, we have
+    // reason to believe it will be what we expect.
+    Function::iterator I = Callee->begin();
+    BasicBlock *Prologue = I++;
+    if (I == Callee->end()) break;
+    BasicBlock *SubnormalHandling = I++;
+    if (I == Callee->end()) break;
+    BasicBlock *Body = I++;
+    if (I != Callee->end()) break;
+    if (SubnormalHandling->getSinglePredecessor() != Prologue)
+      break;
+    BranchInst *PBI = dyn_cast<BranchInst>(Prologue->getTerminator());
+    if (!PBI || !PBI->isConditional())
+      break;
+    BranchInst *SNBI = dyn_cast<BranchInst>(SubnormalHandling->getTerminator());
+    if (!SNBI || SNBI->isConditional())
+      break;
+    if (!isa<ReturnInst>(Body->getTerminator()))
+      break;
+
+    Instruction *NextInst = next(BasicBlock::iterator(Call));
+
+    // Inline the call, taking care of what code ends up where.
+    NewBlock = SplitBlock(NextInst->getParent(), NextInst, this);
+
+    bool B = InlineFunction(Call, 0, TD);
+    assert(B && "half_powr didn't inline?");
+    (void)B; // Only read by the assert above.
+
+    BasicBlock *NewBody = NewBlock->getSinglePredecessor();
+    assert(NewBody);
+    Bodies.push_back(NewBody);
+  }
+
+  if (!NewBlock)
+    return InsertPt;
+
+  // Put the code for all the bodies into one block, to facilitate
+  // subsequent optimization.
+  (void)SplitEdge(NewBlock->getSinglePredecessor(), NewBlock, this);
+  for (unsigned i = 0, e = Bodies.size(); i != e; ++i) {
+    BasicBlock *Body = Bodies[i];
+    Instruction *FNP = Body->getFirstNonPHI();
+    // Splice the insts from body into NewBlock.
+    NewBlock->getInstList().splice(NewBlock->begin(), Body->getInstList(),
+                                   FNP, Body->getTerminator());
+  }
+
+  return NewBlock->begin();
+}
+
+/// runOnFunction - Top level algorithm.
+///
+/// Scans every block of F for runs of consecutive direct calls to
+/// __half_powrf4 and hands each run (at most three calls) to InlineHalfPowrs.
+/// Returns true if any run was handed off.
+bool SimplifyHalfPowrLibCalls::runOnFunction(Function &F) {
+  TD = &getAnalysis<TargetData>();
+
+  bool Changed = false;
+  std::vector<Instruction *> HalfPowrs;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Look for calls.
+      bool IsHalfPowr = false;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        // Look for direct calls to functions with external linkage.
+        Function *Callee = CI->getCalledFunction();
+        if (Callee && Callee->hasExternalLinkage()) {
+          // Look for calls with well-known names.
+          const char *CalleeName = Callee->getNameStart();
+          if (strcmp(CalleeName, "__half_powrf4") == 0)
+            IsHalfPowr = true;
+        }
+      }
+      if (IsHalfPowr)
+        HalfPowrs.push_back(I);
+      // We're looking for sequences of up to three such calls, which we'll
+      // simplify as a group.
+      if ((!IsHalfPowr && !HalfPowrs.empty()) || HalfPowrs.size() == 3) {
+        I = InlineHalfPowrs(HalfPowrs, I);
+        // Inlining splits the block; continue the scan to the end of
+        // whichever block the returned instruction now lives in.
+        E = I->getParent()->end();
+        HalfPowrs.clear();
+        // Conservative: set even if InlineHalfPowrs declined to inline.
+        Changed = true;
+      }
+    }
+    assert(HalfPowrs.empty() && "Block had no terminator!");
+  }
+
+  return Changed;
+}
diff --git a/test/Transforms/SimplifyLibCalls/half-powr.ll b/test/Transforms/SimplifyLibCalls/half-powr.ll
new file mode 100644
index 00000000000..f4e898c0b23
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/half-powr.ll
@@ -0,0 +1,41 @@
+; RUN: llvm-as < %s | opt -simplify-libcalls-halfpowr | llvm-dis | %prcontext {mul float} 1 | grep {mul float} | count 8
+
+define float @__half_powrf4(float %f, float %g) nounwind readnone {
+entry:
+        %0 = fcmp olt float %f, 2.000000e+00            ; [#uses=1]
+        br i1 %0, label %bb, label %bb1
+
+bb:             ; preds = %entry
+        %1 = fdiv float %f, 3.000000e+00                ; [#uses=1]
+        br label %bb1
+
+bb1:            ; preds = %bb, %entry
+        %f_addr.0 = phi float [ %1, %bb ], [ %f, %entry ]               ; [#uses=1]
+        %2 = mul float %f_addr.0, %g            ; [#uses=1]
+        ret float %2
+}
+
+define void @foo(float* %p) nounwind {
+entry:
+        %0 = load float* %p, align 4            ; [#uses=1]
+        %1 = getelementptr float* %p, i32 1             ; [#uses=1]
+        %2 = load float* %1, align 4            ; [#uses=1]
+        %3 = getelementptr float* %p, i32 2             ; [#uses=1]
+        %4 = load float* %3, align 4            ; [#uses=1]
+        %5 = getelementptr float* %p, i32 3             ; [#uses=1]
+        %6 = load float* %5, align 4            ; [#uses=1]
+        %7 = getelementptr float* %p, i32 4             ; [#uses=1]
+        %8 = load float* %7, align 4            ; [#uses=1]
+        %9 = getelementptr float* %p, i32 5             ; [#uses=1]
+        %10 = load float* %9, align 4           ; [#uses=1]
+        %11 = tail call float @__half_powrf4(float %0, float %6) nounwind               ; [#uses=1]
+        %12 = tail call float @__half_powrf4(float %2, float %8) nounwind               ; [#uses=1]
+        %13 = tail call float @__half_powrf4(float %4, float %10) nounwind              ; [#uses=1]
+        %14 = getelementptr float* %p, i32 6            ; [#uses=1]
+        store float %11, float* %14, align 4
+        %15 = getelementptr float* %p, i32 7            ; [#uses=1]
+        store float %12, float* %15, align 4
+        %16 = getelementptr float* %p, i32 8            ; [#uses=1]
+        store float %13, float* %16, align 4
+        ret void
+}