diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 51925bde82f..c81d1f76d01 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -642,21 +642,30 @@ public: /// This function returns the maximum number of store operations permitted /// to replace a call to llvm.memset. The value is set by the target at the - /// performance threshold for such a replacement. + /// performance threshold for such a replacement. If OptSize is true, + /// return the limit for functions that have OptSize attribute. /// @brief Get maximum # of store operations permitted for llvm.memset - unsigned getMaxStoresPerMemset() const { return maxStoresPerMemset; } + unsigned getMaxStoresPerMemset(bool OptSize) const { + return OptSize ? maxStoresPerMemsetOptSize : maxStoresPerMemset; + } /// This function returns the maximum number of store operations permitted /// to replace a call to llvm.memcpy. The value is set by the target at the - /// performance threshold for such a replacement. + /// performance threshold for such a replacement. If OptSize is true, + /// return the limit for functions that have OptSize attribute. /// @brief Get maximum # of store operations permitted for llvm.memcpy - unsigned getMaxStoresPerMemcpy() const { return maxStoresPerMemcpy; } + unsigned getMaxStoresPerMemcpy(bool OptSize) const { + return OptSize ? maxStoresPerMemcpyOptSize : maxStoresPerMemcpy; + } /// This function returns the maximum number of store operations permitted /// to replace a call to llvm.memmove. The value is set by the target at the - /// performance threshold for such a replacement. + /// performance threshold for such a replacement. If OptSize is true, + /// return the limit for functions that have OptSize attribute. 
/// @brief Get maximum # of store operations permitted for llvm.memmove - unsigned getMaxStoresPerMemmove() const { return maxStoresPerMemmove; } + unsigned getMaxStoresPerMemmove(bool OptSize) const { + return OptSize ? maxStoresPerMemmoveOptSize : maxStoresPerMemmove; + } /// This function returns true if the target allows unaligned memory accesses. /// of the specified type. This is used, for example, in situations where an @@ -1776,6 +1785,10 @@ protected: /// @brief Specify maximum number of store instructions per memset call. unsigned maxStoresPerMemset; + /// Maximum number of store operations that may be substituted for the call + /// to memset, used for functions with OptSize attribute. + unsigned maxStoresPerMemsetOptSize; + /// When lowering \@llvm.memcpy this field specifies the maximum number of /// store operations that may be substituted for a call to memcpy. Targets /// must set this value based on the cost threshold for that target. Targets @@ -1788,6 +1801,10 @@ protected: /// @brief Specify maximum bytes of store instructions per memcpy call. unsigned maxStoresPerMemcpy; + /// Maximum number of store operations that may be substituted for a call + /// to memcpy, used for functions with OptSize attribute. + unsigned maxStoresPerMemcpyOptSize; + /// When lowering \@llvm.memmove this field specifies the maximum number of /// store instructions that may be substituted for a call to memmove. Targets /// must set this value based on the cost threshold for that target. Targets @@ -1799,6 +1816,10 @@ protected: /// @brief Specify maximum bytes of store instructions per memmove call. unsigned maxStoresPerMemmove; + /// Maximum number of store instructions that may be substituted for a call + /// to memmove, used for functions with OptSize attribute. + unsigned maxStoresPerMemmoveOptSize; + /// This field specifies whether the target can benefit from code placement /// optimization.
bool benefitFromCodePlacementOpt; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 110812c4371..27dd4ea1459 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3281,15 +3281,6 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, VT = LVT; } - // If we're optimizing for size, and there is a limit, bump the maximum number - // of operations inserted down to 4. This is a wild guess that approximates - // the size of a call to memcpy or memset (3 arguments + call). - if (Limit != ~0U) { - const Function *F = DAG.getMachineFunction().getFunction(); - if (F->hasFnAttr(Attribute::OptimizeForSize)) - Limit = 4; - } - unsigned NumMemOps = 0; while (Size != 0) { unsigned VTSize = VT.getSizeInBits() / 8; @@ -3335,7 +3326,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool OptSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; @@ -3345,7 +3338,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, std::string Str; bool CopyFromStr = isMemSrcFromString(Src, Str); bool isZeroStr = CopyFromStr && Str.empty(); - unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(); + unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 
0 : Align), @@ -3426,14 +3419,16 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool OptSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; unsigned SrcAlign = DAG.InferPtrAlignment(Src); if (Align > SrcAlign) SrcAlign = Align; - unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(); + unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), @@ -3502,13 +3497,15 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); std::vector MemOps; bool DstAlignCanChange = false; - MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); + MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo *MFI = MF.getFrameInfo(); + bool OptSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); FrameIndexSDNode *FI = dyn_cast(Dst); if (FI && !MFI->isFixedObjectIndex(FI->getIndex())) DstAlignCanChange = true; bool NonScalarIntSafe = isa(Src) && cast(Src)->isNullValue(); - if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(), + if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize), Size, (DstAlignCanChange ? 
0 : Align), 0, NonScalarIntSafe, false, DAG, TLI)) return SDValue(); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 1300df8cbae..396ebc15c6d 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -567,6 +567,8 @@ TargetLowering::TargetLowering(const TargetMachine &tm, memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*)); memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray)); maxStoresPerMemset = maxStoresPerMemcpy = maxStoresPerMemmove = 8; + maxStoresPerMemsetOptSize = maxStoresPerMemcpyOptSize + = maxStoresPerMemmoveOptSize = 4; benefitFromCodePlacementOpt = false; UseUnderscoreSetJmp = false; UseUnderscoreLongJmp = false; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 22b3b431deb..f50eac523f4 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -687,7 +687,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) else setSchedulingPreference(Sched::Hybrid); - maxStoresPerMemcpy = 1; //// temporary - rewrite interface to use type + //// temporary - rewrite interface to use type + maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 1; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index fca7b02c40a..f871b5a7701 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -978,11 +978,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) computeRegisterProperties(); - // FIXME: These should be based on subtarget info. Plus, the values should - // be smaller when we are in optimizing for size mode. + // On Darwin, -Os means optimize for size without hurting performance, + // do not reduce the limit. 
maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores + maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores - maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores + maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4; + maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores + maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(16); benefitFromCodePlacementOpt = true; } diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 11840a19568..d1c40ac81f8 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -149,8 +149,9 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM) setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); - maxStoresPerMemset = 4; - maxStoresPerMemmove = maxStoresPerMemcpy = 2; + maxStoresPerMemset = maxStoresPerMemsetOptSize = 4; + maxStoresPerMemmove = maxStoresPerMemmoveOptSize + = maxStoresPerMemcpy = maxStoresPerMemcpyOptSize = 2; // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::STORE); diff --git a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll index c6421a247ea..6db3ce1f42c 100644 --- a/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll +++ b/test/CodeGen/X86/2010-06-25-CoalescerSubRegDefDead.ll @@ -1,4 +1,4 @@ -; RUN: llc -O1 -mtriple=x86_64-apple-darwin10 -relocation-model=pic -disable-fp-elim < %s | FileCheck %s +; RUN: llc -O1 -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic -disable-fp-elim < %s | FileCheck %s ; %struct.type = type { %struct.subtype*, i32, i8, i32, i8, i32, i32, i32, i32, i32, i8, i32, i32, i32, i32, i32, [256 x i32], i32, [257 x i32], [257 x i32], i32*, i16*, i8*, 
i32, i32, i32, i32, i32, [256 x i8], [16 x i8], [256 x i8], [4096 x i8], [16 x i32], [18002 x i8], [18002 x i8], [6 x [258 x i8]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x [258 x i32]], [6 x i32], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32*, i32*, i32* } @@ -21,9 +21,9 @@ bb: ; statement. It can be an ADD or LEA instruction, it's not important which one ; it is. ; -; CHECK: ## %bb -; CHECK-NEXT: addq $64036, %rdi -; CHECK: rep;stosl +; CHECK: # %bb +; CHECK: addq $64036, %rdi +; CHECK: rep;stosl %tmp5 = bitcast i32* %tmp4 to i8* call void @llvm.memset.p0i8.i64(i8* %tmp5, i8 0, i64 84, i32 4, i1 false) diff --git a/test/CodeGen/X86/memcpy.ll b/test/CodeGen/X86/memcpy.ll index 7bc31bec163..72342cbacb4 100644 --- a/test/CodeGen/X86/memcpy.ll +++ b/test/CodeGen/X86/memcpy.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -march=x86-64 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=LINUX +; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=DARWIN declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind @@ -9,8 +10,8 @@ entry: tail call void @llvm.memcpy.p0i8.p0i8.i64( i8* %a, i8* %b, i64 %n, i32 1, i1 0 ) ret i8* %a -; CHECK: test1: -; CHECK: memcpy +; LINUX: test1: +; LINUX: memcpy } ; Variable memcpy's should lower to calls. @@ -21,18 +22,41 @@ entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp14, i8* %tmp25, i64 %n, i32 8, i1 0 ) ret i8* %tmp14 -; CHECK: test2: -; CHECK: memcpy +; LINUX: test2: +; LINUX: memcpy } ; Large constant memcpy's should lower to a call when optimizing for size. ; PR6623 + +; On the other hand, Darwin's definition of -Os is optimizing for size without +; hurting performance so it should just ignore optsize when expanding memcpy. 
+; rdar://8821501 define void @test3(i8* nocapture %A, i8* nocapture %B) nounwind optsize noredzone { entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false) ret void -; CHECK: test3: -; CHECK: memcpy +; LINUX: test3: +; LINUX: memcpy + +; DARWIN: test3: +; DARWIN-NOT: memcpy +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq +; DARWIN: movq } ; Large constant memcpy's should be inlined when not optimizing for size. @@ -40,18 +64,18 @@ define void @test4(i8* nocapture %A, i8* nocapture %B) nounwind noredzone { entry: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %A, i8* %B, i64 64, i32 1, i1 false) ret void -; CHECK: test4: -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq -; CHECK: movq +; LINUX: test4: +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq +; LINUX: movq }