diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index fea9e21c7eb..3d060eed177 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -97,6 +97,10 @@ namespace { void findUsesOfImpDef(SmallVectorImpl &UsesOfImpDefs, const MemOpQueue &MemOps, unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd); + void UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc dl, unsigned Base, unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg); bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, int Offset, unsigned Base, bool BaseKill, int Opcode, ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch, @@ -140,6 +144,46 @@ namespace { char ARMLoadStoreOpt::ID = 0; } +static bool definesCPSR(const MachineInstr *MI) { + for (const auto &MO : MI->operands()) { + if (!MO.isReg()) + continue; + if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead()) + // If the instruction has live CPSR def, then it's not safe to fold it + // into load / store. + return true; + } + + return false; +} + +static int getMemoryOpOffset(const MachineInstr *MI) { + int Opcode = MI->getOpcode(); + bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; + unsigned NumOperands = MI->getDesc().getNumOperands(); + unsigned OffField = MI->getOperand(NumOperands-3).getImm(); + + if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || + Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || + Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 || + Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) + return OffField; + + // Thumb1 immediate offsets are scaled by 4 + if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi) + return OffField * 4; + + int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) + : ARM_AM::getAM5Offset(OffField) * 4; + ARM_AM::AddrOpc Op = isAM3 ? ARM_AM::getAM3Op(OffField) + : ARM_AM::getAM5Op(OffField); + + if (Op == ARM_AM::sub) + return -Offset; + + return Offset; +} + static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) { switch (Opcode) { default: llvm_unreachable("Unhandled opcode!"); @@ -307,6 +351,120 @@ static bool isi32Store(unsigned Opc) { return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc); } +static unsigned getImmScale(unsigned Opc) { + switch (Opc) { + default: llvm_unreachable("Unhandled opcode!"); + case ARM::tLDRi: + case ARM::tSTRi: + return 1; + case ARM::tLDRHi: + case ARM::tSTRHi: + return 2; + case ARM::tLDRBi: + case ARM::tSTRBi: + return 4; + } +} + +/// Update future uses of the base register with the offset introduced +/// due to writeback. This function only works on Thumb1. +void +ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc dl, unsigned Base, + unsigned WordOffset, + ARMCC::CondCodes Pred, unsigned PredReg) { + assert(isThumb1 && "Can only update base register uses for Thumb1!"); + // Start updating any instructions with immediate offsets. Insert a SUB before + // the first non-updateable instruction (if any). + for (; MBBI != MBB.end(); ++MBBI) { + bool InsertSub = false; + unsigned Opc = MBBI->getOpcode(); + + if (MBBI->readsRegister(Base)) { + int Offset; + bool IsLoad = + Opc == ARM::tLDRi || Opc == ARM::tLDRHi || Opc == ARM::tLDRBi; + bool IsStore = + Opc == ARM::tSTRi || Opc == ARM::tSTRHi || Opc == ARM::tSTRBi; + + if (IsLoad || IsStore) { + // Loads and stores with immediate offsets can be updated, but only if + // the new offset isn't negative. + // The MachineOperand containing the offset immediate is the last one + // before predicates. + MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + // The offsets are scaled by 1, 2 or 4 depending on the Opcode. + Offset = MO.getImm() - WordOffset * getImmScale(Opc); + + // If storing the base register, it needs to be reset first. + unsigned InstrSrcReg = MBBI->getOperand(0).getReg(); + + if (Offset >= 0 && !(IsStore && InstrSrcReg == Base)) + MO.setImm(Offset); + else + InsertSub = true; + + } else if ((Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) && + !definesCPSR(MBBI)) { + // SUBS/ADDS using this register, with a dead def of the CPSR. + // Merge it with the update; if the merged offset is too large, + // insert a new sub instead. + MachineOperand &MO = + MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3); + Offset = (Opc == ARM::tSUBi8) ? + MO.getImm() + WordOffset * 4 : + MO.getImm() - WordOffset * 4 ; + if (Offset >= 0 && TL->isLegalAddImmediate(Offset)) { + // FIXME: Swap ADDS<->SUBS if Offset < 0, erase instruction if + // Offset == 0. + MO.setImm(Offset); + // The base register has now been reset, so exit early. + return; + } else { + InsertSub = true; + } + + } else { + // Can't update the instruction. + InsertSub = true; + } + + } else if (definesCPSR(MBBI) || MBBI->isCall() || MBBI->isBranch()) { + // Since SUBS sets the condition flags, we can't place the base reset + // after an instruction that has a live CPSR def. + // The base register might also contain an argument for a function call. + InsertSub = true; + } + + if (InsertSub) { + // An instruction above couldn't be updated, so insert a sub. + AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true) + .addReg(Base, getKillRegState(false)).addImm(WordOffset * 4) + .addImm(Pred).addReg(PredReg); + return; + } + + if (MBBI->killsRegister(Base)) + // Register got killed. Stop updating. + return; + } + + // End of block was reached. + if (MBB.succ_size() > 0) { + // FIXME: Because of a bug, live registers are sometimes missing from + // the successor blocks' live-in sets. This means we can't trust that + // information and *always* have to reset at the end of a block. + // See PR21029. + if (MBBI != MBB.end()) --MBBI; + AddDefaultT1CC( + BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base), true) + .addReg(Base, getKillRegState(false)).addImm(WordOffset * 4) + .addImm(Pred).addReg(PredReg); + } +} + /// MergeOps - Create and insert a LDM or STM with Base as base register and /// registers in Regs as the register operands that would be loaded / stored. /// It returns true if the transformation is done. @@ -329,6 +487,22 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, (MBB.computeRegisterLiveness(TRI, ARM::CPSR, std::prev(MBBI), 15) == MachineBasicBlock::LQR_Dead); + bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback. + + // Exception: If the base register is in the input reglist, Thumb1 LDM is + // non-writeback. + // It's also not possible to merge an STR of the base register in Thumb1. + if (isThumb1) + for (unsigned I = 0; I < NumRegs; ++I) + if (Base == Regs[I].first) { + if (Opcode == ARM::tLDRi) { + Writeback = false; + break; + } else if (Opcode == ARM::tSTRi) { + return false; + } + } + ARM_AM::AMSubMode Mode = ARM_AM::ia; // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA. bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode); @@ -421,24 +595,16 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, Opcode = getLoadStoreMultipleOpcode(Opcode, Mode); if (!Opcode) return false; - bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback. - - // Exception: If the base register is in the input reglist, Thumb1 LDM is - // non-writeback. Check for this. - if (Opcode == ARM::tLDMIA && isThumb1) - for (unsigned I = 0; I < NumRegs; ++I) - if (Base == Regs[I].first) { - Writeback = false; - break; - } - - // If the merged instruction has writeback and the base register is not killed - // it's not safe to do the merge on Thumb1. This is because resetting the base - // register writeback by inserting a SUBS sets the condition flags. - // FIXME: Try something clever here to see if resetting the base register can - // be avoided, e.g. by updating a later ADD/SUB of the base register with the - // writeback. - if (isThumb1 && Writeback && !BaseKill) return false; + // Check if a Thumb1 LDM/STM merge is safe. This is the case if: + // - There is no writeback (LDM of base register), + // - the base register is killed by the merged instruction, + // - or it's safe to overwrite the condition flags, i.e. to insert a SUBS + // to reset the base register. + // Otherwise, don't merge. + // It's safe to return here since the code to materialize a new base register + // above is also conditional on SafeToClobberCPSR. + if (isThumb1 && !SafeToClobberCPSR && Writeback && !BaseKill) + return false; MachineInstrBuilder MIB; @@ -452,6 +618,12 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB, // Thumb1: we might need to set base writeback when building the MI. MIB.addReg(Base, getDefRegState(true)) .addReg(Base, getKillRegState(BaseKill)); + + // The base isn't dead after a merged instruction with writeback. + // Insert a sub instruction after the newly formed instruction to reset. + if (!BaseKill) + UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg); + } else { // No writeback, simply build the MachineInstr. MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode)); @@ -622,6 +794,11 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB, memOps[i].MBBI = Merges.back(); memOps[i].Position = insertPos; } + + // Update memOps offsets, since they may have been modified by MergeOps. + for (auto &MemOp : memOps) { + MemOp.Offset = getMemoryOpOffset(MemOp.MBBI); + } } /// MergeLDR_STR - Merge a number of load / store instructions into one or more @@ -704,20 +881,6 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges); } -static bool definesCPSR(MachineInstr *MI) { - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - if (MO.isDef() && MO.getReg() == ARM::CPSR && !MO.isDead()) - // If the instruction has live CPSR def, then it's not safe to fold it - // into load / store. - return true; - } - - return false; -} - static bool isMatchingDecrement(MachineInstr *MI, unsigned Base, unsigned Bytes, unsigned Limit, ARMCC::CondCodes Pred, unsigned PredReg) { @@ -1255,34 +1418,6 @@ void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) { RS->forward(std::prev(Loc)); } -static int getMemoryOpOffset(const MachineInstr *MI) { - int Opcode = MI->getOpcode(); - bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD; - unsigned NumOperands = MI->getDesc().getNumOperands(); - unsigned OffField = MI->getOperand(NumOperands-3).getImm(); - - if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 || - Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 || - Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8 || - Opcode == ARM::LDRi12 || Opcode == ARM::STRi12) - return OffField; - - // Thumb1 immediate offsets are scaled by 4 - if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi) - return OffField * 4; - - int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField) - : ARM_AM::getAM5Offset(OffField) * 4; - if (isAM3) { - if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub) - Offset = -Offset; - } else { - if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub) - Offset = -Offset; - } - return Offset; -} - static void InsertLDR_STR(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int Offset, bool isDef, diff --git a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll index 18e17766313..cfa1159bda2 100644 --- a/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll +++ b/test/CodeGen/Thumb/2014-06-10-thumb1-ldst-opt-bug.ll @@ -4,8 +4,8 @@ define void @foo(i32* %A) #0 { entry: ; CHECK-LABEL: foo: ; CHECK: push {r7, lr} -; CHECK: ldr -; CHECK-NEXT: ldr +; CHECK: ldm +; CHECK-NEXT: subs ; CHECK-NEXT: bl %0 = load i32* %A, align 4 %arrayidx1 = getelementptr inbounds i32* %A, i32 1 diff --git a/test/CodeGen/Thumb/dyn-stackalloc.ll b/test/CodeGen/Thumb/dyn-stackalloc.ll index 706a21350d0..309d80217c1 100644 --- a/test/CodeGen/Thumb/dyn-stackalloc.ll +++ b/test/CodeGen/Thumb/dyn-stackalloc.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_GREEDY +; RUN: llc < %s -mtriple=thumb-apple-darwin -disable-cgp-branch-opts -disable-post-ra -regalloc=basic -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -check-prefix=RA_BASIC %struct.state = type { i32, %struct.info*, float**, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i8* } %struct.info = type { i32, i32, i32, i32, i32, i32, i32, i8* } @@ -45,7 +45,8 @@ define void @t2(%struct.comment* %vc, i8* %tag, i8* %contents) { ; CHECK: sub sp, # ; CHECK: mov r[[R0:[0-9]+]], sp ; CHECK: str r{{[0-9+]}}, [r[[R0]] -; CHECK: str r{{[0-9+]}}, [r[[R0]] +; RA_GREEDY: str r{{[0-9+]}}, [r[[R0]] +; RA_BASIC: stm r[[R0]]! ; CHECK-NOT: ldr r0, [sp ; CHECK: mov r[[R1:[0-9]+]], sp ; CHECK: subs r[[R2:[0-9]+]], r[[R1]], r{{[0-9]+}} diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll index c97036289c8..da2f3f09b28 100644 --- a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll +++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll @@ -1,5 +1,4 @@ ; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s - @d = external global [64 x i32] @s = external global [64 x i32] @@ -7,8 +6,12 @@ define void @t1() #0 { entry: ; CHECK-LABEL: t1: -; CHECK-NOT: ldm -; CHECK-NOT: stm +; CHECK: ldr r[[LB:[0-9]]], +; CHECK-NEXT: ldm r[[LB]]!, +; CHECK-NEXT: ldr r[[SB:[0-9]]], +; CHECK-NEXT: stm r[[SB]]!, +; CHECK-NEXT: ldrb {{.*}}, [r[[LB]]] +; CHECK-NEXT: strb {{.*}}, [r[[SB]]] tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 17, i32 4, i1 false) ret void } @@ -17,8 +20,14 @@ entry: define void @t2() #0 { entry: ; CHECK-LABEL: t2: -; CHECK-NOT: ldm -; CHECK-NOT: stm +; CHECK: ldr r[[LB:[0-9]]], +; CHECK-NEXT: ldm r[[LB]]!, +; CHECK-NEXT: ldr r[[SB:[0-9]]], +; CHECK-NEXT: stm r[[SB]]!, +; CHECK-NEXT: ldrh {{.*}}, [r[[LB]]] +; CHECK-NEXT: ldrb {{.*}}, [r[[LB]], #2] +; CHECK-NEXT: strb {{.*}}, [r[[SB]], #2] +; CHECK-NEXT: strh {{.*}}, [r[[SB]]] tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false) ret void }