1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 12:41:49 +01:00

[ARM] Thumb2: favor R4-R7 over R12/LR in allocation order when opt for minsize

For Thumb2, we prefer low registers (costPerUse = 0) to allow narrow
encodings. However, the current allocation order is:
  R0-R3, R12, LR, R4-R11

As a result, many instructions that use R12/LR will be wide instructions.

This patch changes the allocation order to:
  R0-R7, R12, LR, R8-R11
for thumb2 and -Osize.

In most cases, there are no extra push/pop instructions, as they are folded
into existing ones. There might be a slight performance impact due to
increased stack usage, so we only enable it when optimising for minimum size.

https://reviews.llvm.org/D30324

llvm-svn: 365014
This commit is contained in:
Oliver Stannard 2019-07-03 09:58:52 +00:00
parent 6f97dc2303
commit e6ef9c7af1
7 changed files with 96 additions and 8 deletions

View File

@ -291,6 +291,14 @@ public:
/// This is called after a .mir file was loaded.
virtual void mirFileLoaded(MachineFunction &MF) const;
/// True if the register allocator should use the allocation orders exactly as
/// written in the tablegen descriptions, false if it should allocate
/// the specified physical register later if it is callee-saved.
virtual bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const {
// Conservative default: treat callee-saved registers normally (the
// allocator defers them because they cost prologue/epilogue work).
// Targets may override, e.g. to prefer cheap-to-encode callee-saved
// registers when optimising for size.
return false;
}
};
} // end namespace llvm

View File

@ -90,6 +90,7 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
assert(RC && "no register class given");
RCInfo &RCI = RegClass[RC->getID()];
auto &STI = MF->getSubtarget();
// Raw register count, including all reserved regs.
unsigned NumRegs = RC->getNumRegs();
@ -114,7 +115,8 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
unsigned Cost = TRI->getCostPerUse(PhysReg);
MinCost = std::min(MinCost, Cost);
if (CalleeSavedAliases[PhysReg])
if (CalleeSavedAliases[PhysReg] &&
!STI.ignoreCSRForAllocationOrder(*MF, PhysReg))
// PhysReg aliases a CSR, save it for later.
CSRAlias.push_back(PhysReg);
else {

View File

@ -227,9 +227,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
// know how to spill them. If we make our prologue/epilogue code smarter at
// some point, we can go back to using the above allocation orders for the
// Thumb1 instructions that know how to use hi regs.
let AltOrders = [(add LR, GPR), (trunc GPR, 8)];
let AltOrders = [(add LR, GPR), (trunc GPR, 8),
(add (trunc GPR, 8), R12, LR, (shl GPR, 8))];
let AltOrderSelect = [{
return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticString = "operand must be a register in range [r0, r15]";
}
@ -238,9 +239,10 @@ def GPR : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12),
// certain operand slots, particularly as the destination. Primarily
// useful for disassembly.
def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8),
(add (trunc GPRnopc, 8), R12, LR, (shl GPRnopc, 8))];
let AltOrderSelect = [{
return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticString = "operand must be a register in range [r0, r14]";
}
@ -295,9 +297,10 @@ def GPRlr : RegisterClass<"ARM", [i32], 32, (add LR)>;
// or SP (R13 or R15) are used. The ARM ISA refers to these operands
// via the BadReg() pseudo-code description.
def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
let AltOrders = [(add LR, rGPR), (trunc rGPR, 8)];
let AltOrders = [(add LR, rGPR), (trunc rGPR, 8),
(add (trunc rGPR, 8), R12, LR, (shl rGPR, 8))];
let AltOrderSelect = [{
return 1 + MF.getSubtarget<ARMSubtarget>().isThumb1Only();
return MF.getSubtarget<ARMSubtarget>().getGPRAllocationOrder(MF);
}];
let DiagnosticType = "rGPR";
}

View File

@ -413,3 +413,45 @@ bool ARMSubtarget::useFastISel() const {
((isTargetMachO() && !isThumb1Only()) ||
(isTargetLinux() && !isThumb()) || (isTargetNaCl() && !isThumb()));
}
unsigned ARMSubtarget::getGPRAllocationOrder(const MachineFunction &MF) const {
// The GPR register class has multiple possible allocation orders, with
// tradeoffs preferred by different sub-architectures and optimisation goals.
// The allocation orders are:
// 0: (the default tablegen order, not used)
// 1: r14, r0-r13
// 2: r0-r7
// 3: r0-r7, r12, lr, r8-r11
// Note that the register allocator will change this order so that
// callee-saved registers are used later, as they require extra work in the
// prologue/epilogue (though we sometimes override that).
// For thumb1-only targets, only the low registers are allocatable.
if (isThumb1Only())
return 2;
// Allocate low registers first, so we can select more 16-bit instructions.
// We also (in ignoreCSRForAllocationOrder) override the default behaviour
// with regards to callee-saved registers, because pushing extra registers is
// much cheaper (in terms of code size) than using high registers. After
// that, we allocate r12 (doesn't need to be saved), lr (saving it means we
// can return with the pop, don't need an extra "bx lr") and then the rest of
// the high registers.
if (isThumb2() && MF.getFunction().hasMinSize())
return 3;
// Otherwise, allocate in the default order, using LR first because saving it
// allows a shorter epilogue sequence.
return 1;
}
bool ARMSubtarget::ignoreCSRForAllocationOrder(const MachineFunction &MF,
                                               unsigned PhysReg) const {
  // In Thumb2 at minsize we want the low registers (lower cost per use,
  // narrow encodings) handed out before caller-saved high registers such as
  // r12/lr. By default the allocator postpones callee-saved registers
  // regardless of their cost per use; suppress that for GPRs here, since the
  // extra push/pop can usually be folded into existing ones.
  if (!isThumb2())
    return false;
  if (!MF.getFunction().hasMinSize())
    return false;
  return ARM::GPRRegClass.contains(PhysReg);
}

View File

@ -856,6 +856,10 @@ public:
/// Getter for the PrefLoopAlignment subtarget field.
/// NOTE(review): presumably the preferred alignment for loop headers; the
/// units (bytes vs. log2) are not visible in this fragment -- confirm at
/// the field's definition.
unsigned getPrefLoopAlignment() const {
return PrefLoopAlignment;
}
/// \see TargetSubtargetInfo::ignoreCSRForAllocationOrder. Overridden so
/// that for Thumb2 at minsize, callee-saved low GPRs are not pushed to the
/// back of the allocation order.
bool ignoreCSRForAllocationOrder(const MachineFunction &MF,
unsigned PhysReg) const override;
/// Returns which of the GPR register class's alternative allocation orders
/// (AltOrders in ARMRegisterInfo.td) to use for this function.
unsigned getGPRAllocationOrder(const MachineFunction &MF) const;
};
} // end namespace llvm

View File

@ -60,7 +60,7 @@ entry:
while.body:
; CHECK: while.body
; CHECK: mul r{{[0-9]+}}
; CHECK: muls r{{[0-9]+}}
; CHECK: muls
%ptr1.addr.09 = phi i32* [ %add.ptr, %while.body ], [ %ptr1, %entry ]
%ptr2.addr.08 = phi i32* [ %incdec.ptr, %while.body ], [ %ptr2, %entry ]

View File

@ -0,0 +1,29 @@
; REQUIRES: asserts
; RUN: llc -debug-only=regalloc < %s 2>%t | FileCheck %s --check-prefix=CHECK
; RUN: FileCheck %s < %t --check-prefix=DEBUG
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n8:16:32-S64"
target triple = "thumbv7m--linux-gnueabi"
; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r4 $r5 $r6 $r7 $r12 $lr $r8 $r9 $r10 $r11 ]
; With minsize, the low-regs-first order (r0-r7, r12, lr, r8-r11) is used, so
; %x is kept in r4 (a low, callee-saved register) across the asm that clobbers
; r0-r3, allowing narrow mov encodings.
define i32 @test_minsize(i32 %x) optsize minsize {
; CHECK-LABEL: test_minsize:
entry:
; CHECK: mov r4, r0
tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
; CHECK: mov r0, r4
ret i32 %x
}
; DEBUG: AllocationOrder(GPR) = [ $r0 $r1 $r2 $r3 $r12 $lr $r4 $r5 $r6 $r7 $r8 $r9 $r10 $r11 ]
; With optsize only (no minsize), the default order is used: r12 (caller-saved,
; allocated before the callee-saved r4-r11) holds %x across the clobbering asm.
define i32 @test_optsize(i32 %x) optsize {
; CHECK-LABEL: test_optsize:
entry:
; CHECK: mov r12, r0
tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
; CHECK: mov r0, r12
ret i32 %x
}