diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index cd89235e474..a158f0a8f5b 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -317,6 +317,7 @@ namespace { // Helper fuctions bool processStore(StoreInst *SI, BasicBlock::iterator &BBI); bool processMemCpy(MemCpyInst *M); + bool processMemMove(MemMoveInst *M); bool performCallSlotOptzn(MemCpyInst *cpy, CallInst *C); bool iterateOnFunction(Function &F); }; @@ -431,9 +432,8 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { BasicBlock::iterator InsertPt = BI; if (MemSetF == 0) { - const Type *Tys[] = {Type::getInt64Ty(SI->getContext())}; - MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, - Tys, 1); + const Type *Ty = Type::getInt64Ty(SI->getContext()); + MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, &Ty, 1); } // Get the starting pointer of the block. @@ -679,11 +679,10 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return false; // If all checks passed, then we can transform these memcpy's - const Type *Tys[1]; - Tys[0] = M->getLength()->getType(); + const Type *Ty = M->getLength()->getType(); Function *MemCpyFun = Intrinsic::getDeclaration( M->getParent()->getParent()->getParent(), - M->getIntrinsicID(), Tys, 1); + M->getIntrinsicID(), &Ty, 1); Value *Args[4] = { M->getRawDest(), MDep->getRawSource(), M->getLength(), M->getAlignmentCst() @@ -708,6 +707,36 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { return false; } +/// processMemMove - Transforms memmove calls to memcpy calls when the src/dst +/// are guaranteed not to alias. 
+bool MemCpyOpt::processMemMove(MemMoveInst *M) {
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+  // If the length is a ConstantInt, use its value as the alias query size
+  // (~0ULL otherwise); this lets us handle things like memmove(P, P+64, 64).
+  uint64_t MemMoveSize = ~0ULL;
+  if (ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength()))
+    MemMoveSize = Len->getZExtValue();
+
+  // Bail out unless the alias query on dest/source answers NoAlias.
+  if (AA.alias(M->getRawDest(), MemMoveSize, M->getRawSource(), MemMoveSize) !=
+      AliasAnalysis::NoAlias)
+    return false;
+
+  DEBUG(errs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n");
+
+  // No overlap: replace operand 0 of the call with the memcpy declaration.
+  Module *Mod = M->getParent()->getParent()->getParent();
+  const Type *Ty = M->getLength()->getType();
+  M->setOperand(0, Intrinsic::getDeclaration(Mod, Intrinsic::memcpy, &Ty, 1));
+
+  // MemDep may hold overly conservative information about this instruction,
+  // so simply flush the instruction from its cache.
+  getAnalysis<MemoryDependenceAnalysis>().removeInstruction(M);
+  return true;
+}
+
+
 // MemCpyOpt::iterateOnFunction - Executes one iteration of GVN.
 bool MemCpyOpt::iterateOnFunction(Function &F) {
   bool MadeChange = false;
@@ -723,6 +752,12 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
         MadeChange |= processStore(SI, BI);
       else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
         MadeChange |= processMemCpy(M);
+      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) {
+        if (processMemMove(M)) {
+          --BI;         // Reprocess the new memcpy.
+ MadeChange = true; + } + } } } diff --git a/test/Transforms/MemCpyOpt/2008-06-01-MemCpy-MemMove.ll b/test/Transforms/MemCpyOpt/2008-06-01-MemCpy-MemMove.ll deleted file mode 100644 index 16d2df4bacb..00000000000 --- a/test/Transforms/MemCpyOpt/2008-06-01-MemCpy-MemMove.ll +++ /dev/null @@ -1,107 +0,0 @@ -; RUN: llvm-as < %s | opt -memcpyopt | llvm-dis | grep {call.*memmove.*arg1.*} -; PR2401 - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" -target triple = "i686-pc-linux-gnu" - %struct.Info = type <{ i32, i32, i8*, i8*, i8*, [32 x i8*], i32, [32 x i32], i32, i32, i32, [32 x i32] }> - %struct.S98 = type <{ [31 x double] }> - %struct._IO_FILE = type <{ i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct._IO_FILE*, i32, i32, i32, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i32, i32, [40 x i8] }> - %struct._IO_marker = type <{ %struct._IO_marker*, %struct._IO_FILE*, i32 }> - %struct.anon = type <{ }> - %union.anon = type { } -@info = common global %struct.Info zeroinitializer, align 4 ; <%struct.Info*> [#uses=13] -@fails = common global i32 0, align 4 ; [#uses=37] -@s98 = common global %struct.S98 zeroinitializer, align 4 ; <%struct.S98*> [#uses=2] -@a98 = common global [5 x %struct.S98] zeroinitializer, align 4 ; <[5 x %struct.S98]*> [#uses=5] -@stdout = external global %struct._IO_FILE* ; <%struct._IO_FILE**> [#uses=1] - -declare void @llvm.memmove.i32(i8*, i8*, i32, i32) nounwind - -define void @test98() nounwind { -entry: - %arg = alloca %struct.S98, align 8 ; <%struct.S98*> [#uses=2] - %tmp13 = alloca %struct.S98 ; <%struct.S98*> [#uses=2] - %tmp14 = alloca %struct.S98 ; <%struct.S98*> [#uses=2] - %tmp15 = alloca %struct.S98 ; <%struct.S98*> [#uses=2] - %tmp17 = alloca %struct.S98 ; <%struct.S98*> [#uses=2] - %tmp21 = alloca %struct.S98 ; <%struct.S98*> [#uses=0] - %tmp23 = alloca %struct.S98 ; <%struct.S98*> [#uses=0] - %tmp25 
= alloca %struct.S98 ; <%struct.S98*> [#uses=0] - %tmp27 = alloca %struct.S98 ; <%struct.S98*> [#uses=0] - %tmp29 = alloca %struct.S98 ; <%struct.S98*> [#uses=0] - %tmp31 = alloca %struct.S98 ; <%struct.S98*> [#uses=0] - %tmp33 = alloca %struct.S98 ; <%struct.S98*> [#uses=0] - call void @llvm.memset.i32( i8* bitcast (%struct.S98* @s98 to i8*), i8 0, i32 248, i32 4 ) - call void @llvm.memset.i32( i8* bitcast ([5 x %struct.S98]* @a98 to i8*), i8 0, i32 1240, i32 4 ) - call void @llvm.memset.i32( i8* bitcast (%struct.Info* @info to i8*), i8 0, i32 420, i32 4 ) - store i8* bitcast (%struct.S98* @s98 to i8*), i8** getelementptr (%struct.Info* @info, i32 0, i32 2) - store i8* bitcast ([5 x %struct.S98]* @a98 to i8*), i8** getelementptr (%struct.Info* @info, i32 0, i32 3) - store i8* bitcast (%struct.S98* getelementptr ([5 x %struct.S98]* @a98, i32 0, i32 3) to i8*), i8** getelementptr (%struct.Info* @info, i32 0, i32 4) - store i32 248, i32* getelementptr (%struct.Info* @info, i32 0, i32 6) - store i32 4, i32* getelementptr (%struct.Info* @info, i32 0, i32 8) - store i32 4, i32* getelementptr (%struct.Info* @info, i32 0, i32 9) - store i32 4, i32* getelementptr (%struct.Info* @info, i32 0, i32 10) - %tmp = load i32* getelementptr (%struct.Info* @info, i32 0, i32 8) ; [#uses=1] - %sub = add i32 %tmp, -1 ; [#uses=1] - %and = and i32 %sub, ptrtoint (%struct.S98* getelementptr ([5 x %struct.S98]* @a98, i32 0, i32 3) to i32) ; [#uses=1] - %tobool = icmp eq i32 %and, 0 ; [#uses=1] - br i1 %tobool, label %ifend, label %ifthen - -ifthen: ; preds = %entry - %tmp3 = load i32* @fails ; [#uses=1] - %inc = add i32 %tmp3, 1 ; [#uses=1] - store i32 %inc, i32* @fails - br label %ifend - -ifend: ; preds = %ifthen, %entry - store i8* bitcast (double* getelementptr (%struct.S98* @s98, i32 0, i32 0, i32 18) to i8*), i8** getelementptr (%struct.Info* @info, i32 0, i32 5, i32 0) - store i32 8, i32* getelementptr (%struct.Info* @info, i32 0, i32 7, i32 0) - store i32 4, i32* getelementptr 
(%struct.Info* @info, i32 0, i32 11, i32 0) - store double 0xC1075E4620000000, double* getelementptr (%struct.S98* @s98, i32 0, i32 0, i32 18) - store double 0x410CD219E0000000, double* getelementptr ([5 x %struct.S98]* @a98, i32 0, i32 2, i32 0, i32 18) - store i32 1, i32* getelementptr (%struct.Info* @info, i32 0, i32 0) - store i32 0, i32* getelementptr (%struct.Info* @info, i32 0, i32 1) - %tmp16 = bitcast %struct.S98* %tmp15 to i8* ; [#uses=1] - call void @llvm.memmove.i32( i8* %tmp16, i8* bitcast (%struct.S98* @s98 to i8*), i32 248, i32 4 ) - %tmp18 = bitcast %struct.S98* %tmp17 to i8* ; [#uses=1] - call void @llvm.memmove.i32( i8* %tmp18, i8* bitcast (%struct.S98* getelementptr ([5 x %struct.S98]* @a98, i32 0, i32 2) to i8*), i32 248, i32 4 ) - call void @check98( %struct.S98* sret %tmp14, %struct.S98* byval %tmp15, %struct.S98* getelementptr ([5 x %struct.S98]* @a98, i32 0, i32 1), %struct.S98* byval %tmp17 ) - %tmp19 = bitcast %struct.S98* %tmp13 to i8* ; [#uses=1] - %tmp20 = bitcast %struct.S98* %tmp14 to i8* ; [#uses=1] - call void @llvm.memmove.i32( i8* %tmp19, i8* %tmp20, i32 248, i32 8 ) - %tmp1 = bitcast %struct.S98* %arg to i8* ; [#uses=1] - %tmp2 = bitcast %struct.S98* %tmp13 to i8* ; [#uses=1] - call void @llvm.memcpy.i64( i8* %tmp1, i8* %tmp2, i64 248, i32 8 ) - %arrayidx.i = getelementptr %struct.S98* %arg, i32 0, i32 0, i32 18 ; [#uses=1] - %tmp1.i = load double* %arrayidx.i, align 8 ; [#uses=1] - %tmp2.i = load double* getelementptr (%struct.S98* @s98, i32 0, i32 0, i32 18) ; [#uses=1] - %cmp.i = fcmp une double %tmp1.i, %tmp2.i ; [#uses=1] - br i1 %cmp.i, label %ifthen.i, label %checkx98.exit - -ifthen.i: ; preds = %ifend - %tmp3.i = load i32* @fails ; [#uses=1] - %inc.i = add i32 %tmp3.i, 1 ; [#uses=1] - store i32 %inc.i, i32* @fails - br label %checkx98.exit - -checkx98.exit: ; preds = %ifthen.i, %ifend - ret void -} - -declare void @check98(%struct.S98* sret %agg.result, %struct.S98* byval %arg0, %struct.S98* %arg1, %struct.S98* byval 
%arg2) nounwind - -declare void @llvm.va_start(i8*) nounwind - -declare void @llvm.va_end(i8*) nounwind - -declare i32 @main() noreturn - -declare i32 @fflush(%struct._IO_FILE*) - -declare void @abort() noreturn nounwind - -declare void @exit(i32) noreturn nounwind - -declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind - -declare void @llvm.memcpy.i64(i8*, i8*, i64, i32) nounwind diff --git a/test/Transforms/MemCpyOpt/memmove.ll b/test/Transforms/MemCpyOpt/memmove.ll new file mode 100644 index 00000000000..64d9db13d2f --- /dev/null +++ b/test/Transforms/MemCpyOpt/memmove.ll @@ -0,0 +1,37 @@ +; RUN: llvm-as < %s | opt -memcpyopt | llvm-dis | FileCheck %s +; These memmoves should get optimized to memcpys. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +target triple = "x86_64-apple-darwin9.0" + +declare void @llvm.memmove.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind + +define i8* @test1(i8* nocapture %src) nounwind { +entry: +; CHECK: @test1 +; CHECK: call void @llvm.memcpy + + %call3 = malloc [13 x i8] ; <[13 x i8]*> [#uses=1] + %call3.sub = getelementptr inbounds [13 x i8]* %call3, i64 0, i64 0 ; <i8*> [#uses=2] + tail call void @llvm.memmove.i64(i8* %call3.sub, i8* %src, i64 13, i32 1) + ret i8* %call3.sub +} + +define void @test2(i8* %P) nounwind { +entry: +; CHECK: @test2 +; CHECK: call void @llvm.memcpy + %add.ptr = getelementptr i8* %P, i64 16 ; <i8*> [#uses=1] + tail call void @llvm.memmove.i64(i8* %P, i8* %add.ptr, i64 16, i32 1) + ret void +} + +; This cannot be optimized because the src/dst really do overlap. +define void @test3(i8* %P) nounwind { +entry: +; CHECK: @test3 +; CHECK: call void @llvm.memmove + %add.ptr = getelementptr i8* %P, i64 16 ; <i8*> [#uses=1] + tail call void @llvm.memmove.i64(i8* %P, i8* %add.ptr, i64 17, i32 1) + ret void +}