From d28587407f81b084bc59abe851f31ee1787e834b Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Tue, 28 Aug 2012 02:10:27 +0000 Subject: [PATCH] Eliminate redundant CR moves on PPC32. The 32-bit ABI requires CR bit 6 to be set if the call has fp arguments and unset if it doesn't. The solution up to now was to insert a MachineNode to set/unset the CR bit, which produces a CR vreg. This vreg was then copied into CR bit 6. When the register allocator saw a bunch of these in the same function, it allocated the set/unset CR bit in some random CR register (1 extra instruction) and then emitted CR moves before every vararg function call, rather than just setting and unsetting CR bit 6 directly before every vararg function call. This patch instead inserts a PPCcrset/PPCcrunset instruction which are then matched by a dedicated instruction pattern. Patch by Tobias von Koch. llvm-svn: 162725 --- lib/Target/PowerPC/PPCISelLowering.cpp | 22 +++++++++++------ lib/Target/PowerPC/PPCISelLowering.h | 4 +++ lib/Target/PowerPC/PPCInstrInfo.td | 16 ++++++++++++ test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll | 26 ++++++++++++++++++++ 4 files changed, 60 insertions(+), 8 deletions(-) create mode 100644 test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 88078246c1f..9ec100e6904 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -517,6 +517,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ"; case PPCISD::MTFSF: return "PPCISD::MTFSF"; case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN"; + case PPCISD::CR6SET: return "PPCISD::CR6SET"; + case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET"; } } @@ -2834,6 +2836,10 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl, isTailCall, RegsToPass, Ops, NodeTys, PPCSubTarget); + // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls + if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) + Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32)); + // When performing tail call optimization the callee pops its arguments off // the stack. Account for this here so these bytes can be pushed back on in // PPCRegisterInfo::eliminateCallFramePseudoInstr. @@ -3131,14 +3137,6 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0], MemOpChains.size()); - // Set CR6 to true if this is a vararg call with floating args passed in - // registers. - if (isVarArg) { - SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET, - dl, MVT::i32), 0); - RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR)); - } - // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; @@ -3148,6 +3146,14 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee, InFlag = Chain.getValue(1); } + // Set CR bit 6 to true if this is a vararg call with floating args passed in + // registers. + if (isVarArg) { + Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET, + dl, DAG.getVTList(MVT::Other, MVT::Glue), Chain); + InFlag = Chain.getValue(1); + } + if (isTailCall) PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp, false, TailCallArguments); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index b0a013b4b4c..902b188da72 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -174,6 +174,10 @@ namespace llvm { /// operand #3 optional in flag TC_RETURN, + /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls + CR6SET, + CR6UNSET, + /// STD_32 - This is the STD instruction for use with "32-bit" registers. STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE, diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 20af16cea30..0230b4e7954 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -155,6 +155,12 @@ def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx, def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx, [SDNPHasChain, SDNPMayStore]>; +// Instructions to set/unset CR bit 6 for SVR4 vararg calls +def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + // Instructions to support atomic operations def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx, [SDNPHasChain, SDNPMayLoad]>; @@ -1145,6 +1151,16 @@ def CRUNSET: XLForm_1_ext<19, 193, (outs CRBITRC:$dst), (ins), "crxor $dst, $dst, $dst", BrCR, []>; +let Defs = [CR1EQ], CRD = 6 in { +def CR6SET : XLForm_1_ext<19, 289, (outs), (ins), + "creqv 6, 6, 6", BrCR, + [(PPCcr6set)]>; + +def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins), + "crxor 6, 6, 6", BrCR, + [(PPCcr6unset)]>; +} + // XFX-Form instructions. Instructions that deal with SPRs. // let Uses = [CTR] in { diff --git a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll new file mode 100644 index 00000000000..afa1ea8e75a --- /dev/null +++ b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32" +target triple = "powerpc-unknown-linux" + +@.str = private unnamed_addr constant [3 x i8] c"%i\00", align 1 + +define void @test(i32 %count) nounwind { +entry: +; CHECK: crxor 6, 6, 6 + %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind + %cmp2 = icmp sgt i32 %count, 0 + br i1 %cmp2, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ] +; CHECK: crxor 6, 6, 6 + %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind + %inc = add nsw i32 %i.03, 1 + %exitcond = icmp eq i32 %inc, %count + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare i32 @printf(i8* nocapture, ...) nounwind