1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 12:41:49 +01:00

[ARM] Thumb2: favor R4-R7 over R12/LR in allocation order when opt for minsize

For Thumb2, we prefer low registers (costPerUse = 0) to allow narrow
encodings. However, the current allocation order is:
  R0-R3, R12, LR, R4-R11

As a result, many instructions that use R12/LR will be wide instructions.

This patch changes the allocation order to:
  R0-R7, R12, LR, R8-R11
for thumb2 and -Osize.

In most cases, there are no extra push/pop instructions, as they are folded
into existing ones. There might be a slight performance impact due to
increased stack usage, so we only enable it when optimising for minimum size.

https://reviews.llvm.org/D30324

llvm-svn: 365014
This commit is contained in:
Oliver Stannard 2019-07-03 09:58:52 +00:00
parent 6f97dc2303
commit e6ef9c7af1
7 changed files with 96 additions and 8 deletions

View File

@ -291,6 +291,14 @@ public:
/// This is called after a .mir file was loaded.
virtual void mirFileLoaded(MachineFunction &MF) const;
/// True if the register allocator should use the allocation orders exactly as
/// written in the tablegen descriptions, false if it should allocate
/// the specified physical register later if it is callee-saved.
virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const {
// Conservative default: treat callee-saved registers normally (the
// allocator defers them because they cost prologue/epilogue work).
// Targets may override, e.g. to prefer cheap-to-encode callee-saved
// registers when optimising for size.
return false;
}
};
} // end namespace llvm

View File

@ -90,6 +90,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
assert(RC && "no register class given");
RCInfo &RCI = RegClass[RC->getID()];
auto &STI = MF->getSubtarget();
// Raw register count, including all reserved regs.
unsigned NumRegs = RC->getNumRegs();
@ -114,7 +115,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
unsigned Cost = TRI->getCostPerUse(PhysReg);
MinCost = std::min(MinCost, Cost);
if (CalleeSavedAliases[PhysReg])
if (CalleeSavedAliases[PhysReg] &&
!STI.ignoreCSRForAllocationOrder(*MF, PhysReg))
// PhysReg aliases a CSR, save it for later.
CSRAlias.push_back(PhysReg);
else {

View File

@ -227,9 +227,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
// know how to spill them. If we make our prologue/epilogue code smarter at
// some point, we can go back to using the above allocation orders for the
// Thumb1 instructions that know how to use hi regs.
let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
let AltOrders = [(add LR, GPR), (trunc GPR, 8),
(add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
let AltOrderSelect = [{
return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticString = "operand must be a register in range [r0, r15]";
}
@ -238,9 +239,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
// certain operand slots, particularly as the destination. Primarily
// useful for disassembly.
def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
(add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
let AltOrderSelect = [{
return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticString = "operand must be a register in range [r0, r14]";
}
@ -295,9 +297,10 @@ def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>;
// or SP (R13 or R15) are used. The ARM ISA refers to these operands
// via the BadReg() pseudo-code description.
def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
(add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
let AltOrderSelect = [{
return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticType = "rGPR";
}

View File

@ -413,3 +413,45 @@ bool ARMSubtarget::useFastISel() const {
((isTargetMachO() && !isThumb1Only()) ||
(isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
}
unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
// The GPR register class has multiple possible allocation orders, with
// tradeoffs preferred by different sub-architectures and optimisation goals.
// The allocation orders are:
// 0: (the default tablegen order, not used)
// 1: r14, r0-r13
// 2: r0-r7
// 3: r0-r7, r12, lr, r8-r11
// Note that the register allocator will change this order so that
// callee-saved registers are used later, as they require extra work in the
// prologue/epilogue (though we sometimes override that).
// For thumb1-only targets, only the low registers are allocatable.
if (isThumb1Only())
return 2;
// Allocate low registers first, so we can select more 16-bit instructions.
// We also (in ignoreCSRForAllocationOrder) override the default behaviour
// with regards to callee-saved registers, because pushing extra registers is
// much cheaper (in terms of code size) than using high registers. After
// that, we allocate r12 (doesn't need to be saved), lr (saving it means we
// can return with the pop, don't need an extra "bx lr") and then the rest of
// the high registers.
if (isThumb2() && MF.getFunction().hasMinSize())
return 3;
// Otherwise, allocate in the default order, using LR first because saving it
// allows a shorter epilogue sequence.
return 1;
}
bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
                                               unsigned PhysReg) const {
  // In Thumb2 at minsize we want the low registers (lower cost per use,
  // narrow encodings) handed out before caller-saved high registers such as
  // r12/lr. By default the allocator postpones callee-saved registers
  // regardless of their cost per use; suppress that for GPRs here, since the
  // extra push/pop can usually be folded into existing ones.
  if (!isThumb2())
    return false;
  if (!MF.getFunction().hasMinSize())
    return false;
  return ARM::GPRRegClass.contains(PhysReg);
}

View File

@ -856,6 +856,10 @@ public:
/// Getter for the PrefLoopAlignment subtarget field.
/// NOTE(review): presumably the preferred alignment for loop headers; the
/// units (bytes vs. log2) are not visible in this fragment -- confirm at
/// the field's definition.
unsigned getPrefLoopAlignment() const {
return PrefLoopAlignment;
}
/// \see TargetSubtargetInfo::ignoreCSRForAllocationOrder. Overridden so
/// that for Thumb2 at minsize, callee-saved low GPRs are not pushed to the
/// back of the allocation order.
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const override;
/// Returns which of the GPR register class's alternative allocation orders
/// (AltOrders in ARMRegisterInfo.td) to use for this function.
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
};
} // end namespace llvm

View File

@ -60,7 +60,7 @@ entry:
while.body:
; CHECK: while.body
; CHECK: mul r{{[0-9]+}}
; CHECK: muls r{{[0-9]+}}
; CHECK: muls
%ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
%ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]

View File

@ -0,0 +1,29 @@
; REQUIRES: asserts
; RUN: llc -debug-only=regalloc < %s 2>%t | FileCheck %s --check-prefix=CHECK
; RUN: FileCheck %s < %t --check-prefix=DEBUG
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
target triple = "thumbv7m--linux-gnueabi"
; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r4 $r5 $r6 $r7 $r12 $lr $r8 $r9 $r10 $r11 ]
; With minsize, the low-regs-first order (r0-r7, r12, lr, r8-r11) is used, so
; %x is kept in r4 (a low, callee-saved register) across the asm that clobbers
; r0-r3, allowing narrow mov encodings.
define i32 @test_minsize(i32 %x) optsize minsize {
; CHECK-LABEL: test_minsize:
entry:
; CHECK: mov r4, r0
tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
; CHECK: mov r0, r4
ret i32 %x
}
; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r12 $lr $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 ]
; With optsize only (no minsize), the default order is used: r12 (caller-saved,
; allocated before the callee-saved r4-r11) holds %x across the clobbering asm.
define i32 @test_optsize(i32 %x) optsize {
; CHECK-LABEL: test_optsize:
entry:
; CHECK: mov r12, r0
tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
; CHECK: mov r0, r12
ret i32 %x
}