diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 41a23ed6c7c..a4f78fe4573 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Function.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Instructions.h"
+#include "llvm/ParameterAttributes.h"
 #include "llvm/Value.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
@@ -738,6 +739,8 @@ namespace {
     bool processNonLocalLoad(LoadInst* L,
                              SmallVector<Instruction*, 4>& toErase);
     bool processMemCpy(MemCpyInst* M, SmallVector<Instruction*, 4>& toErase);
+    bool performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                SmallVector<Instruction*, 4>& toErase);
     Value *GetValueForBlock(BasicBlock *BB, LoadInst* orig,
                             DenseMap<BasicBlock*, Value*> &Phis,
                             bool top_level = false);
@@ -1048,6 +1051,62 @@ bool GVN::processLoad(LoadInst* L,
   return deletedLoad;
 }
 
+/// performReturnSlotOptzn - takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a return slot optimization by having
+/// the call write its result directly into the callee's return parameter
+/// rather than using memcpy
+bool GVN::performReturnSlotOptzn(MemCpyInst* cpy, CallInst* C,
+                                 SmallVector<Instruction*, 4>& toErase) {
+  // Check that we're copying to an argument...
+  Value* cpyDest = cpy->getDest();
+  if (!isa<Argument>(cpyDest))
+    return false;
+
+  // And that the argument is the return slot
+  Argument* sretArg = cast<Argument>(cpyDest);
+  if (!sretArg->hasStructRetAttr())
+    return false;
+
+  // Make sure the return slot is otherwise dead
+  std::set<User*> useList(sretArg->use_begin(), sretArg->use_end());
+  while (!useList.empty()) {
+    User* UI = *useList.begin();
+
+    if (isa<GetElementPtrInst>(UI) || isa<BitCastInst>(UI)) {
+      useList.insert(UI->use_begin(), UI->use_end());
+      useList.erase(UI);
+    } else if (UI == cpy)
+      useList.erase(UI);
+    else
+      return false;
+  }
+
+  // Make sure the call cannot modify the return slot in some unexpected way
+  AliasAnalysis& AA = getAnalysis<AliasAnalysis>();
+  if (AA.getModRefInfo(C, cpy->getRawDest(), ~0UL) != AliasAnalysis::NoModRef)
+    return false;
+
+  // If all checks passed, then we can perform the transformation
+  CallSite CS = CallSite::get(C);
+  for (unsigned i = 0; i < CS.arg_size(); ++i) {
+    if (CS.paramHasAttr(i+1, ParamAttr::StructRet)) {
+      if (CS.getArgument(i)->getType() != cpyDest->getType())
+        return false;
+
+      CS.setArgument(i, cpyDest);
+      break;
+    }
+  }
+
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  MD.dropInstruction(C);
+
+  // Remove the memcpy
+  toErase.push_back(cpy);
+
+  return true;
+}
+
 /// processMemCpy - perform simplication of memcpy's.  If we have memcpy A which
 /// copies X to Y, and memcpy B which copies Y to Z, then we can rewrite B to be
 /// a memcpy from X to Z (or potentially a memmove, depending on circumstances).
@@ -1059,9 +1118,14 @@ bool GVN::processMemCpy(MemCpyInst* M,
   // First, we have to check that the dependency is another memcpy
   Instruction* dep = MD.getDependency(M);
   if (dep == MemoryDependenceAnalysis::None ||
-      dep == MemoryDependenceAnalysis::NonLocal ||
-      !isa<MemCpyInst>(dep))
+      dep == MemoryDependenceAnalysis::NonLocal)
     return false;
+  else if (!isa<MemCpyInst>(dep)) {
+    if (CallInst* C = dyn_cast<CallInst>(dep))
+      return performReturnSlotOptzn(M, C, toErase);
+    else
+      return false;
+  }
 
   // We can only transforms memcpy's where the dest of one is the source of the
   // other
diff --git a/test/Transforms/GVN/sret.ll b/test/Transforms/GVN/sret.ll
new file mode 100644
index 00000000000..9ae73eff387
--- /dev/null
+++ b/test/Transforms/GVN/sret.ll
@@ -0,0 +1,28 @@
+; RUN: llvm-as < %s | opt -gvn | llvm-dis | grep memcpy | count 1
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9"
+
+define void @ccosl({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %z) nounwind {
+entry:
+  %iz = alloca { x86_fp80, x86_fp80 }    ; <{ x86_fp80, x86_fp80 }*> [#uses=3]
+  %memtmp = alloca { x86_fp80, x86_fp80 }, align 16    ; <{ x86_fp80, x86_fp80 }*> [#uses=2]
+  %tmp1 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 1    ; <x86_fp80*> [#uses=1]
+  %tmp2 = load x86_fp80* %tmp1, align 16    ; <x86_fp80> [#uses=1]
+  %tmp3 = sub x86_fp80 0xK80000000000000000000, %tmp2    ; <x86_fp80> [#uses=1]
+  %tmp4 = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 1    ; <x86_fp80*> [#uses=1]
+  %real = getelementptr { x86_fp80, x86_fp80 }* %iz, i32 0, i32 0    ; <x86_fp80*> [#uses=1]
+  %tmp7 = getelementptr { x86_fp80, x86_fp80 }* %z, i32 0, i32 0    ; <x86_fp80*> [#uses=1]
+  %tmp8 = load x86_fp80* %tmp7, align 16    ; <x86_fp80> [#uses=1]
+  store x86_fp80 %tmp3, x86_fp80* %real, align 16
+  store x86_fp80 %tmp8, x86_fp80* %tmp4, align 16
+  call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind
+  %memtmp14 = bitcast { x86_fp80, x86_fp80 }* %memtmp to i8*    ; <i8*> [#uses=1]
+  %agg.result15 = bitcast { x86_fp80, x86_fp80 }* %agg.result to i8*    ; <i8*> [#uses=1]
+  call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )
+  ret void
+}
+
+declare void @ccoshl({ x86_fp80, x86_fp80 }* noalias sret , { x86_fp80, x86_fp80 }* byval ) nounwind
+
+declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind
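Illustrative note, not part of the patch: on the ccosl test case above, assuming the call to @ccoshl is the memcpy's only dependency and %memtmp is otherwise dead, performReturnSlotOptzn rewrites the pair roughly as follows, making the callee write directly into the caller's sret argument and erasing the memcpy:

  ; before -gvn
  call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %memtmp, { x86_fp80, x86_fp80 }* byval %iz ) nounwind
  call void @llvm.memcpy.i32( i8* %agg.result15, i8* %memtmp14, i32 32, i32 16 )

  ; after -gvn with this patch (the now-dead %memtmp and its bitcast remain for later cleanup)
  call void @ccoshl( { x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval %iz ) nounwind

This is why the RUN line expects exactly one remaining occurrence of "memcpy": only the declare of @llvm.memcpy.i32 should survive.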