diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 41a23ed6c7c..a4f78fe4573 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Function.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Instructions.h"
+#include "llvm/ParameterAttributes.h"
 #include "llvm/Value.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
@@ -738,6 +739,8 @@ namespace {
     bool processNonLocalLoad(LoadInst* L,
                              SmallVector<Instruction*, 4>& toErase);
     bool processMemCpy(MemCpyInst* M, SmallVector<Instruction*, 4>& toErase);
+    bool performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                SmallVector<Instruction*, 4>& toErase);
     Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
                             DenseMap<BasicBlock*, Value*> &Phis,
                             bool top_level = false);
@@ -1048,6 +1051,62 @@ bool GVN::processLoad(LoadInst* L,
   return deletedLoad;
 }
 
+/// performReturnSlotOptzn - takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a return slot optimization by having
+/// the call write its result directly into the callee's return parameter
+/// rather than using memcpy
+bool GVN::performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                 SmallVector<Instruction*, 4>& toErase) {
+  // Check that we're copying to an argument...
+  Value* cpyDest = cpy->getDest();
+  if (!isa<Argument>(cpyDest))
+    return false;
+
+  // And that the argument is the return slot
+  Argument* sretArg = cast<Argument>(cpyDest);
+  if (!sretArg->hasStructRetAttr())
+    return false;
+
+  // Make sure the return slot is otherwise dead
+  std::set<User*> useList(sretArg->use_begin(), sretArg->use_end());
+  while (!useList.empty()) {
+    User* UI = *useList.begin();
+
+    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+      useList.insert(UI->use_begin(), UI->use_end());
+      useList.erase(UI);
+    } else if (UI == cpy)
+      useList.erase(UI);
+    else
+      return false;
+  }
+
+  // Make sure the call cannot modify the return slot in some unexpected way
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  if (AA.getModRefInfo(C, cpy->getRawDest(), ~0UL) != AliasAnalysis::NoModRef)
+    return false;
+
+  // If all checks passed, then we can perform the transformation
+  CallSite CS = CallSite::get(C);
+  for (unsigned i = 0; i < CS.arg_size(); ++i) {
+    if (CS.paramHasAttr(i+1, ParamAttr::StructRet)) {
+      if (CS.getArgument(i)->getType() != cpyDest->getType())
+        return false;
+
+      CS.setArgument(i, cpyDest);
+      break;
+    }
+  }
+
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  MD.dropInstruction(C);
+
+  // Remove the memcpy
+  toErase.push_back(cpy);
+
+  return true;
+}
+
 /// processMemCpy - perform simplication of memcpy's.  If we have memcpy A which
 /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
 /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
@@ -1059,9 +1118,14 @@ bool GVN::processMemCpy(MemCpyInst* M,
   // First, we have to check that the dependency is another memcpy
   Instruction* dep = MD.getDependency(M);
   if (dep == MemoryDependenceAnalysis::None ||
-      dep == MemoryDependenceAnalysis::NonLocal ||
-      !isa<MemCpyInst>(dep))
+      dep == MemoryDependenceAnalysis::NonLocal)
     return false;
+  else if (!isa<MemCpyInst>(dep)) {
+    if (CallInst* C = dyn_cast<CallInst>(dep))
+      return performReturnSlotOptzn(M, C, toErase);
+    else
+      return false;
+  }
 
   // We can only transforms memcpy's where the dest of one is the source of the
   // other
diff --git a/test/Transforms/GVN/sret.ll b/test/Transforms/GVN/sret.ll
new file mode 100644
index 00000000000..9ae73eff387
--- /dev/null
+++ b/test/Transforms/GVN/sret.ll
@@ -0,0 +1,28 @@
+; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep memcpy | count 1
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9"
+
+define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %z) nounwind {
+entry:
+  %iz = alloca { x86_fp80, x86_fp80 }    ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
+  %memtmp = alloca { x86_fp80, x86_fp80 }, align 16    ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
+  %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1    ; <x86_fp80*> [#uses=1]
+  %tmp2 = load x86_fp80* %tmp1, align 16    ; <x86_fp80> [#uses=1]
+  %tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2    ; <x86_fp80> [#uses=1]
+  %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1    ; <x86_fp80*> [#uses=1]
+  %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0    ; <x86_fp80*> [#uses=1]
+  %tmp7 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0    ; <x86_fp80*> [#uses=1]
+  %tmp8 = load x86_fp80* %tmp7, align 16    ; <x86_fp80> [#uses=1]
+  store x86_fp80 %tmp3, x86_fp80* %real, align 16
+  store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16
+  call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind
+  %memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8*    ; <i8*> [#uses=1]
+  %agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8*    ; <i8*> [#uses=1]
+  call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )
+  ret void
+}
+
+declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval ) nounwind
+
+declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
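Illustrative note, not part of the patch: on the ccosl test case above, assuming the call to @ccoshl is the memcpy's only dependency and %memtmp is otherwise dead, performReturnSlotOptzn rewrites the pair roughly as follows, making the callee write directly into the caller's sret argument and erasing the memcpy:

  ; before -gvn
  call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind
  call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )

  ; after -gvn with this patch (the now-dead %memtmp and its bitcast remain for later cleanup)
  call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %iz ) nounwind

This is why the RUN line expects exactly one remaining occurrence of "memcpy": only the declare of @llvm.memcpy.i32 should survive.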