From 6cbd4fbe85c0318f0eef8e83e8e1c10a0f8ccf28 Mon Sep 17 00:00:00 2001 From: Manman Ren Date: Tue, 12 Jan 2016 00:47:18 +0000 Subject: [PATCH] CXX_FAST_TLS calling convention: performance improvement for ARM. This is the same change on ARM as r255821 on AArch64. rdar://9001553 llvm-svn: 257424 --- lib/Target/ARM/ARMBaseRegisterInfo.cpp | 13 ++++- lib/Target/ARM/ARMBaseRegisterInfo.h | 2 + lib/Target/ARM/ARMCallingConv.td | 6 +++ lib/Target/ARM/ARMFastISel.cpp | 3 ++ lib/Target/ARM/ARMISelLowering.cpp | 59 +++++++++++++++++++++++ lib/Target/ARM/ARMISelLowering.h | 9 ++++ lib/Target/ARM/ARMMachineFunctionInfo.cpp | 3 +- lib/Target/ARM/ARMMachineFunctionInfo.h | 9 +++- test/CodeGen/ARM/cxx-tlscc.ll | 18 ++++--- 9 files changed, 111 insertions(+), 11 deletions(-) diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 595975de905..a5207705fc6 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -88,10 +88,21 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } if (STI.isTargetDarwin() && F->getCallingConv() == CallingConv::CXX_FAST_TLS) - return CSR_iOS_CXX_TLS_SaveList; + return MF->getInfo<ARMFunctionInfo>()->isSplitCSR() + ? 
CSR_iOS_CXX_TLS_PE_SaveList + : CSR_iOS_CXX_TLS_SaveList; return RegList; } +const MCPhysReg *ARMBaseRegisterInfo::getCalleeSavedRegsViaCopy( + const MachineFunction *MF) const { + assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getInfo<ARMFunctionInfo>()->isSplitCSR()) + return CSR_iOS_CXX_TLS_ViaCopy_SaveList; + return nullptr; +} + const uint32_t * ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h index 4c762dc1a23..6a9a45a6568 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.h +++ b/lib/Target/ARM/ARMBaseRegisterInfo.h @@ -98,6 +98,8 @@ protected: public: /// Code Generation virtual methods... const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; + const MCPhysReg * + getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override; const uint32_t *getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override; const uint32_t *getNoPreservedMask() const override; diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index e386a2e5cff..847ef87c1b2 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -234,6 +234,12 @@ def CSR_iOS_TLSCall : CalleeSavedRegs<(add LR, SP, def CSR_iOS_CXX_TLS : CalleeSavedRegs<(add CSR_iOS, (sequence "R%u", 12, 1), (sequence "D%u", 31, 0))>; +// CSRs that are handled by prologue, epilogue. +def CSR_iOS_CXX_TLS_PE : CalleeSavedRegs<(add LR)>; + +// CSRs that are handled explicitly via copies. +def CSR_iOS_CXX_TLS_ViaCopy : CalleeSavedRegs<(sub CSR_iOS_CXX_TLS, LR)>; + // The "interrupt" attribute is used to generate code that is acceptable in // exception-handlers of various kinds. 
It makes us use a different return // instruction (handled elsewhere) and affects which registers we must return to diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index b5f1ac43bc9..ff2fcfa349d 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -2083,6 +2083,9 @@ bool ARMFastISel::SelectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; + if (TLI.supportSplitCSR(FuncInfo.MF)) + return false; + // Build a list of return value registers. SmallVector<unsigned, 4> RetRegs; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 3c7356ada6c..37c0795af28 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -2348,6 +2348,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain, Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *I = + TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); + if (I) { + for (; *I; ++I) { + if (ARM::GPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::i32)); + else if (ARM::DPRRegClass.contains(*I)) + RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + } + } // Update chain and glue. RetOps[0] = Chain; @@ -12393,3 +12406,49 @@ unsigned ARMTargetLowering::getExceptionSelectorRegister( // via the personality function. return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; } + +void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { + // Update IsSplitCSR in ARMFunctionInfo. 
+ ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>(); + AFI->setIsSplitCSR(true); +} + +void ARMTargetLowering::insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const { + const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); + if (!IStart) + return; + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); + for (const MCPhysReg *I = IStart; *I; ++I) { + const TargetRegisterClass *RC = nullptr; + if (ARM::GPRRegClass.contains(*I)) + RC = &ARM::GPRRegClass; + else if (ARM::DPRRegClass.contains(*I)) + RC = &ARM::DPRRegClass; + else + llvm_unreachable("Unexpected register class in CSRsViaCopy!"); + + unsigned NewVR = MRI->createVirtualRegister(RC); + // Create copy from CSR to a virtual register. + // FIXME: this currently does not emit CFI pseudo-instructions, it works + // fine for CXX_FAST_TLS since the C++-style TLS access functions should be + // nounwind. If we want to generalize this later, we may need to emit + // CFI pseudo-instructions. 
+ assert(Entry->getParent()->getFunction()->hasFnAttribute( + Attribute::NoUnwind) && + "Function should be nounwind in insertCopiesSplitCSR!"); + Entry->addLiveIn(*I); + BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + NewVR) + .addReg(*I); + + for (auto *Exit : Exits) + BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY), + *I) + .addReg(NewVR); + } +} diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 19aac816498..96b56c3ec33 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -580,6 +580,15 @@ namespace llvm { SmallVectorImpl<SDValue> &InVals, bool isThisReturn, SDValue ThisVal) const; + bool supportSplitCSR(MachineFunction *MF) const override { + return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + } + void initializeSplitCSR(MachineBasicBlock *Entry) const override; + void insertCopiesSplitCSR( + MachineBasicBlock *Entry, + const SmallVectorImpl<MachineBasicBlock *> &Exits) const override; + SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/lib/Target/ARM/ARMMachineFunctionInfo.cpp index ac0330fbcb3..71ad7a4a732 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.cpp +++ b/lib/Target/ARM/ARMMachineFunctionInfo.cpp @@ -20,4 +20,5 @@ ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF) RestoreSPFromFP(false), LRSpilledForFarJump(false), FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0), - PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false) {} + PICLabelUId(0), VarArgsFrameIndex(0), HasITBlocks(false), + IsSplitCSR(false) {} diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h index d6447978ef2..68f9aec8cae 100644 --- a/lib/Target/ARM/ARMMachineFunctionInfo.h +++ 
b/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -118,6 +118,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// coalesced weights. DenseMap<const MachineInstr*, unsigned> CoalescedWeights; + /// True if this function has a subset of CSRs that is handled explicitly via + /// copies. + bool IsSplitCSR; + public: ARMFunctionInfo() : isThumb(false), @@ -128,7 +132,7 @@ public: FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0), GPRCS1Size(0), GPRCS2Size(0), DPRCSAlignGapSize(0), DPRCSSize(0), NumAlignedDPRCS2Regs(0), PICLabelUId(0), - VarArgsFrameIndex(0), HasITBlocks(false) {} + VarArgsFrameIndex(0), HasITBlocks(false), IsSplitCSR(false) {} explicit ARMFunctionInfo(MachineFunction &MF); @@ -199,6 +203,9 @@ public: bool hasITBlocks() const { return HasITBlocks; } void setHasITBlocks(bool h) { HasITBlocks = h; } + bool isSplitCSR() const { return IsSplitCSR; } + void setIsSplitCSR(bool s) { IsSplitCSR = s; } + void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) { if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second) llvm_unreachable("Duplicate entries!"); diff --git a/test/CodeGen/ARM/cxx-tlscc.ll b/test/CodeGen/ARM/cxx-tlscc.ll index 7fb9b188817..7b776d4b8e8 100644 --- a/test/CodeGen/ARM/cxx-tlscc.ll +++ b/test/CodeGen/ARM/cxx-tlscc.ll @@ -28,17 +28,19 @@ __tls_init.exit: } ; CHECK-LABEL: _ZTW2sg -; CHECK: push {r1, r2, r3, r4, r7, lr} -; CHECK: push {r9, r12} -; CHECK: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31} -; CHECK: vpush {d0, d1, d2, d3, d4, d5, d6, d7} +; CHECK: push {lr} +; CHECK-NOT: push {r1, r2, r3, r4, r7, lr} +; CHECK-NOT: push {r9, r12} +; CHECK-NOT: vpush {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31} +; CHECK-NOT: vpush {d0, d1, d2, d3, d4, d5, d6, d7} ; CHECK: blx ; CHECK: bne [[BB_end:.?LBB0_[0-9]+]] ; CHECK; blx ; CHECK: tlv_atexit ; CHECK: [[BB_end]]: ; CHECK: blx -; CHECK: vpop {d0, d1, d2, d3, d4, d5, d6, d7} -; CHECK: vpop {d16, d17, d18, d19, d20, d21, 
d22, d23, d24, d25, d26, d27, d28, d29, d30, d31} -; CHECK: pop {r9, r12} -; CHECK: pop {r1, r2, r3, r4, r7, pc} +; CHECK-NOT: vpop {d0, d1, d2, d3, d4, d5, d6, d7} +; CHECK-NOT: vpop {d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31} +; CHECK-NOT: pop {r9, r12} +; CHECK-NOT: pop {r1, r2, r3, r4, r7, pc} +; CHECK: pop {lr}