From 1a5573f2577133030073d54cfc0df7b3ed6dc75a Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Sun, 22 Oct 2017 11:56:35 +0000
Subject: [PATCH] [ARM] Dynamic stack alignment for 16-bit Thumb

This patch implements dynamic stack (re-)alignment for 16-bit Thumb. When
targeting processors, which support only the 16-bit Thumb instruction set
the compiler ignores the alignment attributes of automatic variables and may
silently generate incorrect code.

Differential revision: https://reviews.llvm.org/D38143

llvm-svn: 316289
---
 lib/Target/ARM/ARMBaseRegisterInfo.cpp     |  6 +---
 lib/Target/ARM/ARMFrameLowering.cpp        | 14 ++++-----
 lib/Target/ARM/Thumb1FrameLowering.cpp     | 34 +++++++++++++++++++---
 test/CodeGen/ARM/thumb1_return_sequence.ll | 10 +++++--
 test/CodeGen/Thumb/large-stack.ll          |  4 +--
 test/CodeGen/Thumb/long.ll                 |  9 ++++--
 6 files changed, 53 insertions(+), 24 deletions(-)
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index bf39aebaf44..63b14ee98d7 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -391,16 +391,12 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
 
 bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
   const MachineRegisterInfo *MRI = &MF.getRegInfo();
-  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   const ARMFrameLowering *TFI = getFrameLowering(MF);
   // We can't realign the stack if:
   // 1. Dynamic stack realignment is explicitly disabled,
-  // 2. This is a Thumb1 function (it's not useful, so we don't bother), or
-  // 3. There are VLAs in the function and the base pointer is disabled.
+  // 2. There are VLAs in the function and the base pointer is disabled.
   if (!TargetRegisterInfo::canRealignStack(MF))
     return false;
-  if (AFI->isThumb1OnlyFunction())
-    return false;
   // Stack realignment requires a frame pointer.  If we already started
   // register allocation with frame pointer elimination, it is too late now.
   if (!MRI->canReserveReg(getFramePointerReg(MF.getSubtarget<ARMSubtarget>())))
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 4af744f5ec3..ce4add974d6 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1610,14 +1610,14 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
     if (AFI->getArgRegsSaveSize() > 0)
       SavedRegs.set(ARM::LR);
 
-    // Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
-    // for sure what the stack size will be, but for this, an estimate is good
-    // enough. If there anything changes it, it'll be a spill, which implies
-    // we've used all the registers and so R4 is already used, so not marking
-    // it here will be OK.
+    // Spill R4 if Thumb1 epilogue has to restore SP from FP or the function
+    // requires stack alignment.  We don't know for sure what the stack size
+    // will be, but for this, an estimate is good enough. If there anything
+    // changes it, it'll be a spill, which implies we've used all the registers
+    // and so R4 is already used, so not marking it here will be OK.
     // FIXME: It will be better just to find spare register here.
-    unsigned StackSize = MFI.estimateStackSize(MF);
-    if (MFI.hasVarSizedObjects() || StackSize > 508)
+    if (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF) ||
+        MFI.estimateStackSize(MF) > 508)
       SavedRegs.set(ARM::R4);
   }
 
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 13068992e8f..4f330e3a884 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -352,10 +352,36 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
   AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
   AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
 
-  // Thumb1 does not currently support dynamic stack realignment.  Report a
-  // fatal error rather then silently generate bad code.
-  if (RegInfo->needsStackRealignment(MF))
-      report_fatal_error("Dynamic stack realignment not supported for thumb1.");
+  if (RegInfo->needsStackRealignment(MF)) {
+    const unsigned NrBitsToZero = countTrailingZeros(MFI.getMaxAlignment());
+    // Emit the following sequence, using R4 as a temporary, since we cannot use
+    // SP as a source or destination register for the shifts:
+    // mov  r4, sp
+    // lsrs r4, r4, #NrBitsToZero
+    // lsls r4, r4, #NrBitsToZero
+    // mov  sp, r4
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::R4)
+      .addReg(ARM::SP, RegState::Kill)
+      .add(predOps(ARMCC::AL));
+
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tLSRri), ARM::R4)
+      .addDef(ARM::CPSR)
+      .addReg(ARM::R4, RegState::Kill)
+      .addImm(NrBitsToZero)
+      .add(predOps(ARMCC::AL));
+
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tLSLri), ARM::R4)
+      .addDef(ARM::CPSR)
+      .addReg(ARM::R4, RegState::Kill)
+      .addImm(NrBitsToZero)
+      .add(predOps(ARMCC::AL));
+
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
+      .addReg(ARM::R4, RegState::Kill)
+      .add(predOps(ARMCC::AL));
+
+    AFI->setShouldRestoreSPFromFP(true);
+  }
 
   // If we need a base pointer, set it up here. It's whatever the value
   // of the stack pointer is at this point. Any variable size objects
diff --git a/test/CodeGen/ARM/thumb1_return_sequence.ll b/test/CodeGen/ARM/thumb1_return_sequence.ll
index 67d1cad2cf6..c54712efb39 100644
--- a/test/CodeGen/ARM/thumb1_return_sequence.ll
+++ b/test/CodeGen/ARM/thumb1_return_sequence.ll
@@ -9,6 +9,8 @@ entry:
 ; --------
 ; CHECK-V4T:    push {[[SAVED:(r[4567](, )?)+]], lr}
 ; CHECK-V4T:    sub sp,
+; Stack is realigned because of the <6 x i32> type
+; CHECK-V4T:    mov sp, r4
 ; CHECK-V5T:    push {[[SAVED:(r[4567](, )?)+]], lr}
 
   %b = alloca <6 x i32>, align 16
@@ -21,7 +23,8 @@ entry:
 
 ; Epilogue
 ; --------
-; CHECK-V4T:         add sp,
+; Stack realignment means sp is restored from frame pointer
+; CHECK-V4T:         mov sp
 ; CHECK-V4T-NEXT:    pop {[[SAVED]]}
 ; The ISA for v4 does not support pop pc, so make sure we do not emit
 ; one even when we do not need to update SP.
@@ -70,8 +73,9 @@ entry:
 ; CHECK-V4T-NEXT:    mov lr, [[POP_REG]]
 ; CHECK-V4T-NEXT:    mov [[POP_REG]], r12
 ; CHECK-V4T:         bx  lr
-; CHECK-V5T:         add sp,
-; CHECK-V5T-NEXT:    pop {[[SAVED]]}
+; CHECK-V5T:         lsls r4
+; CHECK-V5T-NEXT:    mov sp, r4
+; CHECK-V5T:         pop {[[SAVED]]}
 ; CHECK-V5T-NEXT:    mov r12, [[POP_REG:r[0-7]]]
 ; CHECK-V5T-NEXT:    pop {[[POP_REG]]}
 ; CHECK-V5T-NEXT:    add sp,
diff --git a/test/CodeGen/Thumb/large-stack.ll b/test/CodeGen/Thumb/large-stack.ll
index b0152ddc4d3..f35bffba5ca 100644
--- a/test/CodeGen/Thumb/large-stack.ll
+++ b/test/CodeGen/Thumb/large-stack.ll
@@ -75,7 +75,7 @@ define i32 @test3() {
 ; CHECK: add sp, [[TEMP3]]
     %retval = alloca i32, align 4
     %tmp = alloca i32, align 4
-    %a = alloca [805306369 x i8], align 16
+    %a = alloca [805306369 x i8], align 4
     store i32 0, i32* %tmp
     %tmp1 = load i32, i32* %tmp
     ret i32 %tmp1
@@ -91,7 +91,7 @@ define i32 @test3_nofpelim() "no-frame-pointer-elim"="true" {
 ; CHECK: mov sp, r4
     %retval = alloca i32, align 4
     %tmp = alloca i32, align 4
-    %a = alloca [805306369 x i8], align 16
+    %a = alloca [805306369 x i8], align 8
     store i32 0, i32* %tmp
     %tmp1 = load i32, i32* %tmp
     ret i32 %tmp1
diff --git a/test/CodeGen/Thumb/long.ll b/test/CodeGen/Thumb/long.ll
index 13951ef4354..7fc46ffb362 100644
--- a/test/CodeGen/Thumb/long.ll
+++ b/test/CodeGen/Thumb/long.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | \
+; RUN:    FileCheck %s -check-prefix CHECK --check-prefix CHECK-EABI
 ; RUN: llc -mtriple=thumb-apple-darwin %s -verify-machineinstrs -o - | \
 ; RUN:    FileCheck %s -check-prefix CHECK -check-prefix CHECK-DARWIN
 
@@ -172,10 +173,12 @@ entry:
         %retval = load i64, i64* %a          ; <i64> [#uses=1]
         ret i64 %retval
 ; CHECK-LABEL: f10:
-; CHECK: sub sp, #8
+; CHECK-EABI: sub sp, #8
+; CHECK-DARWIN: add r7, sp, #4
 ; CHECK: ldr r0, [sp]
 ; CHECK: ldr r1, [sp, #4]
-; CHECK: add sp, #8
+; CHECK-EABI: add sp, #8
+; CHECK-DARWIN: mov sp, r4
 }
 
 define i64 @f11(i64 %x, i64 %y) {