From 0f885eb7fd9c21f2a2e740cba3c0ccadcfc7e03a Mon Sep 17 00:00:00 2001 From: Anton Korobeynikov Date: Tue, 24 Nov 2009 00:44:37 +0000 Subject: [PATCH] Materialize global addresses via movt/movw pair, this is always better than doing the same via constpool: 1. Load from constpool costs 3 cycles on A9, movt/movw pair - just 2. 2. Load from constpool might stall up to 300 cycles due to cache miss. 3. Movt/movw does not use load/store unit. 4. Less constpool entries => better compiler performance. This is only enabled on ELF systems, since darwin does not have needed relocations (yet). llvm-svn: 89720 --- lib/Target/ARM/ARMBaseInstrInfo.h | 16 ++++++++++ lib/Target/ARM/ARMExpandPseudoInsts.cpp | 33 ++++++++++++++------- lib/Target/ARM/ARMISelDAGToDAG.cpp | 20 ++++++++++--- lib/Target/ARM/ARMISelLowering.cpp | 16 +++++++--- lib/Target/ARM/ARMInstrInfo.td | 25 ++++++++++------ lib/Target/ARM/ARMInstrThumb2.td | 16 ++++++---- lib/Target/ARM/ARMSubtarget.cpp | 5 ++++ lib/Target/ARM/ARMSubtarget.h | 8 ++++- lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp | 21 +++++++++---- lib/Target/ARM/Thumb2SizeReduction.cpp | 8 ++++- test/CodeGen/ARM/movt-movw-global.ll | 20 +++++++++++++ 11 files changed, 147 insertions(+), 41 deletions(-) create mode 100644 test/CodeGen/ARM/movt-movw-global.ll diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index db9716e9444..dbd4f63619d 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -162,6 +162,22 @@ namespace ARMII { I_BitShift = 25, CondShift = 28 }; + + /// Target Operand Flag enum. + enum TOF { + //===------------------------------------------------------------------===// + // ARM Specific MachineOperand flags. + + MO_NO_FLAG, + + /// MO_LO16 - On a symbol operand, this represents a relocation containing + /// lower 16 bit of the address. Used only via movw instruction. + MO_LO16, + + /// MO_HI16 - On a symbol operand, this represents a relocation containing + /// higher 16 bit of the address. Used only via movt instruction. + MO_HI16 + }; } class ARMBaseInstrInfo : public TargetInstrInfoImpl { diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 4d0f8993e00..c929c54d489 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -75,17 +75,30 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { } case ARM::t2MOVi32imm: { unsigned DstReg = MI.getOperand(0).getReg(); - unsigned Imm = MI.getOperand(1).getImm(); - unsigned Lo16 = Imm & 0xffff; - unsigned Hi16 = (Imm >> 16) & 0xffff; if (!MI.getOperand(0).isDead()) { - AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(ARM::t2MOVi16), DstReg) - .addImm(Lo16)); - AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), - TII->get(ARM::t2MOVTi16)) - .addReg(DstReg, getDefRegState(true)) - .addReg(DstReg).addImm(Hi16)); + const MachineOperand &MO = MI.getOperand(1); + MachineInstrBuilder LO16, HI16; + + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVi16), + DstReg); + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVTi16)) + .addReg(DstReg, getDefRegState(true)).addReg(DstReg); + + if (MO.isImm()) { + unsigned Imm = MO.getImm(); + unsigned Lo16 = Imm & 0xffff; + unsigned Hi16 = (Imm >> 16) & 0xffff; + LO16 = LO16.addImm(Lo16); + HI16 = HI16.addImm(Hi16); + } else { + GlobalValue *GV = MO.getGlobal(); + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); + HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + // FIXME: What's about memoperands? + } + AddDefaultPred(LO16); + AddDefaultPred(HI16); } MI.eraseFromParent(); Modified = true; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index c7a50ed0e59..d63f3e66fa4 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -261,7 +261,9 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N, if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } else if (N.getOpcode() == ARMISD::Wrapper) { + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { Base = N.getOperand(0); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -463,7 +465,9 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N, if (N.getOpcode() == ISD::FrameIndex) { int FI = cast(N)->getIndex(); Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); - } else if (N.getOpcode() == ARMISD::Wrapper) { + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { Base = N.getOperand(0); } Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), @@ -558,7 +562,13 @@ ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDValue Op, SDValue N, } if (N.getOpcode() != ISD::ADD) { - Base = (N.getOpcode() == ARMISD::Wrapper) ? N.getOperand(0) : N; + if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + Base = N.getOperand(0); + } else + Base = N; + Offset = CurDAG->getRegister(0, MVT::i32); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; @@ -681,7 +691,9 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue Op, SDValue N, Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); OffImm = CurDAG->getTargetConstant(0, MVT::i32); return true; - } else if (N.getOpcode() == ARMISD::Wrapper) { + } else if (N.getOpcode() == ARMISD::Wrapper && + !(Subtarget->useMovt() && + N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { Base = N.getOperand(0); if (Base.getOpcode() == ISD::TargetConstantPool) return false; // We want to select t2LDRpci instead. diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 1549b637f95..c839fc65606 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -39,6 +39,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/VectorExtras.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include @@ -1356,10 +1357,17 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, PseudoSourceValue::getGOT(), 0); return Result; } else { - SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - PseudoSourceValue::getConstantPool(), 0); + // If we have T2 ops, we can materialize the address directly via movt/movw + // pair. This is always cheaper. + if (Subtarget->useMovt()) { + return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, + DAG.getTargetGlobalAddress(GV, PtrVT)); + } else { + SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + PseudoSourceValue::getConstantPool(), 0); + } } } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 92972f1e3a9..0a8ecc008ca 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -116,6 +116,10 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">; def CarryDefIsUnused : Predicate<"!N.getNode()->hasAnyUseOfValue(1)">; def CarryDefIsUsed : Predicate<"N.getNode()->hasAnyUseOfValue(1)">; +// FIXME: Eventually this will be just "hasV6T2Ops". +def UseMovt : Predicate<"Subtarget->useMovt()">; +def DontUseMovt : Predicate<"!Subtarget->useMovt()">; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -204,7 +208,7 @@ def hi16 : SDNodeXFormgetZExtValue()) & 0xFFFFUL) == 0; - }], hi16>; +}], hi16>; /// imm0_65535 predicate - True if the 32-bit immediate is in the range /// [0.65535]. @@ -1002,7 +1006,7 @@ def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src), let Constraints = "$src = $dst" in def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm), DPFrm, IIC_iMOVi, - "movt", "\t$dst, $imm", + "movt", "\t$dst, $imm", [(set GPR:$dst, (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]>, UnaryDP, @@ -1603,12 +1607,6 @@ let Defs = // Non-Instruction Patterns // -// ConstantPool, GlobalAddress, and JumpTable -def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>; -def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; -def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), - (LEApcrelJT tjumptable:$dst, imm:$id)>; - // Large immediate handling. // Two piece so_imms. @@ -1638,10 +1636,19 @@ def : ARMPat<(add GPR:$LHS, so_neg_imm2part:$RHS), // FIXME: Remove this when we can do generalized remat. let isReMaterializable = 1 in def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi, - "movw", "\t$dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}", + "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", [(set GPR:$dst, (i32 imm:$src))]>, Requires<[IsARM, HasV6T2]>; +// ConstantPool, GlobalAddress, and JumpTable +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, + Requires<[IsARM, DontUseMovt]>; +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, + Requires<[IsARM, UseMovt]>; +def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (LEApcrelJT tjumptable:$dst, imm:$id)>; + // TODO: add,sub,and, 3-instr forms? diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index c46e30416c6..94898152992 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -1181,12 +1181,6 @@ def : T2Pat<(add GPR:$LHS, t2_so_neg_imm2part:$RHS), (t2SUBri (t2SUBri GPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)), (t2_so_neg_imm2part_2 imm:$RHS))>; -// ConstantPool, GlobalAddress, and JumpTable -def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>; -def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; -def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), - (t2LEApcrelJT tjumptable:$dst, imm:$id)>; - // 32-bit immediate using movw + movt. // This is a single pseudo instruction to make it re-materializable. Remove // when we can do generalized remat. @@ -1195,6 +1189,16 @@ def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}", [(set GPR:$dst, (i32 imm:$src))]>; +// ConstantPool, GlobalAddress, and JumpTable +def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>, + Requires<[IsThumb2, DontUseMovt]>; +def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; +def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, + Requires<[IsThumb2, UseMovt]>; + +def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), + (t2LEApcrelJT tjumptable:$dst, imm:$id)>; + // Pseudo instruction that combines ldr from constpool and add pc. This should // be expanded into two instructions late to allow if-conversion and // scheduling. diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index 432ed78c19a..d6b072b6c27 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -27,6 +27,10 @@ UseNEONFP("arm-use-neon-fp", cl::desc("Use NEON for single-precision FP"), cl::init(false), cl::Hidden); +static cl::opt +UseMOVT("arm-use-movt", + cl::init(true), cl::Hidden); + ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, bool isT) : ARMArchVersion(V4T) @@ -36,6 +40,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS, , ThumbMode(Thumb1) , PostRAScheduler(false) , IsR9Reserved(ReserveR9) + , UseMovt(UseMOVT) , stackAlignment(4) , CPUString("generic") , TargetType(isELF) // Default to ELF unless otherwise specified. diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 3d0e01e99b7..b2467b073b5 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -65,6 +65,10 @@ protected: /// IsR9Reserved - True if R9 is a not available as general purpose register. bool IsR9Reserved; + /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit + /// imms (including global addresses). + bool UseMovt; + /// stackAlignment - The minimum alignment known to hold of the stack frame on /// entry to the function and which must be maintained by every function. unsigned stackAlignment; @@ -130,8 +134,10 @@ protected: bool isR9Reserved() const { return IsR9Reserved; } + bool useMovt() const { return UseMovt && hasV6T2Ops(); } + const std::string & getCPUString() const { return CPUString; } - + /// enablePostRAScheduler - True at 'More' optimization. bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, TargetSubtarget::AntiDepBreakMode& Mode, diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp index daffc5eb922..692bb192426 100644 --- a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp @@ -330,6 +330,8 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, const char *Modifier) { const MachineOperand &MO = MI->getOperand(OpNum); + unsigned TF = MO.getTargetFlags(); + switch (MO.getType()) { default: assert(0 && ""); @@ -356,12 +358,12 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, case MachineOperand::MO_Immediate: { int64_t Imm = MO.getImm(); O << '#'; - if (Modifier) { - if (strcmp(Modifier, "lo16") == 0) - O << ":lower16:"; - else if (strcmp(Modifier, "hi16") == 0) - O << ":upper16:"; - } + if ((Modifier && strcmp(Modifier, "lo16") == 0) || + (TF & ARMII::MO_LO16)) + O << ":lower16:"; + else if ((Modifier && strcmp(Modifier, "hi16") == 0) || + (TF & ARMII::MO_HI16)) + O << ":upper16:"; O << Imm; break; } @@ -371,6 +373,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, case MachineOperand::MO_GlobalAddress: { bool isCallOp = Modifier && !strcmp(Modifier, "call"); GlobalValue *GV = MO.getGlobal(); + + if ((Modifier && strcmp(Modifier, "lo16") == 0) || + (TF & ARMII::MO_LO16)) + O << ":lower16:"; + else if ((Modifier && strcmp(Modifier, "hi16") == 0) || + (TF & ARMII::MO_HI16)) + O << ":upper16:"; O << Mang->getMangledName(GV); printOffset(MO.getOffset()); diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index ad1739c6905..b2fd7b334d8 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -78,7 +78,7 @@ namespace { { ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 }, { ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 }, { ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, - { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 }, + { ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1 }, // FIXME: Do we need the 16-bit 'S' variant? { ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 }, { ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 }, @@ -413,6 +413,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI, if (MI->getOperand(2).getImm() == 0) return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); break; + case ARM::t2MOVi16: + // Can convert only 'pure' immediate operands, not immediates obtained as + // globals' addresses. + if (MI->getOperand(1).isImm()) + return ReduceToNarrow(MBB, MI, Entry, LiveCPSR); + break; } return false; } diff --git a/test/CodeGen/ARM/movt-movw-global.ll b/test/CodeGen/ARM/movt-movw-global.ll new file mode 100644 index 00000000000..886ff3fea7a --- /dev/null +++ b/test/CodeGen/ARM/movt-movw-global.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64" +target triple = "armv7-eabi" + +@foo = common global i32 0 ; [#uses=1] + +define arm_aapcs_vfpcc i32* @bar1() nounwind readnone { +entry: +; CHECK: movw r0, :lower16:foo +; CHECK-NEXT: movt r0, :upper16:foo + ret i32* @foo +} + +define arm_aapcs_vfpcc void @bar2(i32 %baz) nounwind { +entry: +; CHECK: movw r1, :lower16:foo +; CHECK-NEXT: movt r1, :upper16:foo + store i32 %baz, i32* @foo, align 4 + ret void +}