On x86-64, for a varargs function, don't store the xmm registers to

the register save area if %al is 0. This avoids touching xmm regsiters when they aren't actually used. llvm-svn: 79061
2024-11-23 11:13:28 +01:00 · 2009-08-15 01:38:56 +00:00 · 2009-08-15 01:38:56 +00:00 · d69323d37a
commit d69323d37a
parent b6add4ace3
4 changed files with 146 additions and 18 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -1527,37 +1527,44 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
-      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
-                                  DAG.getIntPtrConstant(VarArgsGPOffset));
+      unsigned Offset = VarArgsGPOffset;
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
+                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
+                       Offset);
        MemOps.push_back(Store);
-        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
-                          DAG.getIntPtrConstant(8));
+        Offset += 8;
      }

+      if (!MemOps.empty())
+          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                             &MemOps[0], MemOps.size());
+
      // Now store the XMM (fp + vector) parameter registers.
-      FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
-                        DAG.getIntPtrConstant(VarArgsFPOffset));
+      SmallVector<SDValue, 11> SaveXMMOps;
+      SaveXMMOps.push_back(Chain);
+
+      unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+      SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+      SaveXMMOps.push_back(ALVal);
+
+      SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
+      SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));
+
      for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
        unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
                                     X86::VR128RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
-        SDValue Store =
-          DAG.getStore(Val.getValue(1), dl, Val, FIN,
-                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
-        MemOps.push_back(Store);
-        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
-                          DAG.getIntPtrConstant(16));
+        SaveXMMOps.push_back(Val);
      }
-      if (!MemOps.empty())
-          Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                             &MemOps[0], MemOps.size());
+      Chain = DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl, MVT::Other,
+                          &SaveXMMOps[0], SaveXMMOps.size());
    }
  }

@ -7090,6 +7097,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  case X86ISD::DEC:                return "X86ISD::DEC";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
+  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  }
 }

@ -7513,7 +7521,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

-  // Move all successors to thisMBB to nextMBB
+  // Move all successors of thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);

  // Update thisMBB to fall through to newMBB
@ -7585,6 +7593,73 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
  return nextMBB;
 }

+MachineBasicBlock *
+X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                 MachineInstr *MI,
+                                                 MachineBasicBlock *MBB) const {
+  // Emit code to save XMM registers to the stack. The ABI says that the
+  // number of registers to save is given in %al, so it's theoretically
+  // possible to do an indirect jump trick to avoid saving all of them,
+  // however this code takes a simpler approach and just executes all
+  // of the stores if %al is non-zero. It's less code, and it's probably
+  // easier on the hardware branch predictor, and stores aren't all that
+  // expensive anyway.
+
+  // Create the new basic blocks. One block contains all the XMM stores,
+  // and one block is the final destination regardless of whether any
+  // stores were performed.
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction *F = MBB->getParent();
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, XMMSaveMBB);
+  F->insert(MBBIter, EndMBB);
+
+  // Set up the CFG.
+  // Move any original successors of MBB to the end block.
+  EndMBB->transferSuccessors(MBB);
+  // The original block will now fall through to the XMM save block.
+  MBB->addSuccessor(XMMSaveMBB);
+  // The XMMSaveMBB will fall through to the end block.
+  XMMSaveMBB->addSuccessor(EndMBB);
+
+  // Now add the instructions.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  unsigned CountReg = MI->getOperand(0).getReg();
+  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
+  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();
+
+  if (!Subtarget->isTargetWin64()) {
+    // If %al is 0, branch around the XMM save block.
+    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
+    BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB);
+    MBB->addSuccessor(EndMBB);
+  }
+
+  // In the XMM save block, save all the XMM argument registers.
+  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
+    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
+    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
+      .addFrameIndex(RegSaveFrameIndex)
+      .addImm(/*Scale=*/1)
+      .addReg(/*IndexReg=*/0)
+      .addImm(/*Disp=*/Offset)
+      .addReg(/*Segment=*/0)
+      .addReg(MI->getOperand(i).getReg())
+      .addMemOperand(MachineMemOperand(
+                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
+                       MachineMemOperand::MOStore, Offset,
+                       /*Size=*/16, /*Align=*/16));
+  }
+
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+
+  return EndMBB;
+}

 MachineBasicBlock *
 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
@ -7888,6 +7963,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               X86::MOV32rr, X86::MOV32rr,
                                               X86::MOV32ri, X86::MOV32ri,
                                               false);
+  case X86::VASTART_SAVE_XMM_REGS:
+    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
  }
 }

--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -243,7 +243,12 @@ namespace llvm {
      MUL_IMM,
      
      // PTEST - Vector bitwise comparisons
-      PTEST
+      PTEST,
+
+      // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
+      // according to %al. An operator is needed so that this can be expanded
+      // with control flow.
+      VASTART_SAVE_XMM_REGS
    };
  }

@ -715,6 +720,11 @@ namespace llvm {
                                                          MachineBasicBlock *BB,
                                                        unsigned cmovOpc) const;

+    /// Utility function to emit the xmm reg save portion of va_start.
+    MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                   MachineInstr *BInstr,
+                                                   MachineBasicBlock *BB) const;
+
    /// Emit nodes that will be selected as "test Op0,Op0", or something
    /// equivalent, for use with the given x86 condition code.
    SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG);
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@ -56,6 +56,10 @@ def SDT_X86CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>,

 def SDT_X86Call   : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;

+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+                                                         SDTCisVT<1, iPTR>,
+                                                         SDTCisVT<2, iPTR>]>;
+
 def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;

 def SDTX86RdTsc   : SDTypeProfile<0, 0, []>;
@ -114,6 +118,11 @@ def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
 def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
                        [SDNPHasChain, SDNPOptInFlag]>;

+def X86vastart_save_xmm_regs :
+                 SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+                        SDT_X86VASTART_SAVE_XMM_REGS,
+                        [SDNPHasChain]>;
+
 def X86callseq_start :
                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
                        [SDNPHasChain, SDNPOutFlag]>;
@ -511,6 +520,18 @@ def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                          Requires<[In32BitMode]>;
 }

+// x86-64 va_start lowering magic.
+let usesCustomDAGSchedInserter = 1 in
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+                              (outs),
+                              (ins GR8:$al,
+                                   i64imm:$regsavefi, i64imm:$offset,
+                                   variable_ops),
+                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+                              [(X86vastart_save_xmm_regs GR8:$al,
+                                                         imm:$regsavefi,
+                                                         imm:$offset)]>;
+
 // Nop
 let neverHasSideEffects = 1 in {
  def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
--- a/test/CodeGen/X86/stdarg.ll
+++ b/test/CodeGen/X86/stdarg.ll
@ -0,0 +1,20 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {testb	\[%\]al, \[%\]al}
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+define void @foo(i32 %x, ...) nounwind {
+entry:
+  %ap = alloca [1 x %struct.__va_list_tag], align 8; <[1 x %struct.__va_list_tag]*> [#uses=2]
+  %ap12 = bitcast [1 x %struct.__va_list_tag]* %ap to i8*; <i8*> [#uses=2]
+  call void @llvm.va_start(i8* %ap12)
+  %ap3 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0; <%struct.__va_list_tag*> [#uses=1]
+  call void @bar(%struct.__va_list_tag* %ap3) nounwind
+  call void @llvm.va_end(i8* %ap12)
+  ret void
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @bar(%struct.__va_list_tag*)
+
+declare void @llvm.va_end(i8*) nounwind