From 2d0165e75c00d995b9a89ffa042689b7fc99be78 Mon Sep 17 00:00:00 2001
From: James Molloy <james.molloy@arm.com>
Date: Fri, 15 Jul 2016 08:03:56 +0000
Subject: [PATCH] [Thumb-1] Select post-increment load and store where possible

Thumb-1 doesn't have post-inc or pre-inc load or store instructions. However the LDM/STM instructions with writeback can function as post-inc load/store:

  ldm r0!, {r1}  @ load from r0 into r1 and increment r0 by 4

This only works when the post-increment is exactly 4 and the access is a plain (non-extending, non-truncating) i32 load or store.

llvm-svn: 275540
---
 lib/Target/ARM/ARMISelDAGToDAG.cpp    | 29 ++++++++++
 lib/Target/ARM/ARMISelLowering.cpp    | 40 +++++++++++--
 lib/Target/ARM/ARMInstrThumb.td       | 18 ++++++
 test/CodeGen/Thumb/ldm-stm-postinc.ll | 81 +++++++++++++++++++++++++++
 4 files changed, 164 insertions(+), 4 deletions(-)
 create mode 100644 test/CodeGen/Thumb/ldm-stm-postinc.ll
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 1b1b7751c6c..c5e1d976bbb 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -195,6 +195,7 @@ public:
 private:
   /// Indexed (pre/post inc/dec) load matching code for ARM.
   bool tryARMIndexedLoad(SDNode *N);
+  bool tryT1IndexedLoad(SDNode *N);
   bool tryT2IndexedLoad(SDNode *N);
 
   /// SelectVLD - Select NEON load intrinsics.  NumVecs should be
@@ -1543,6 +1544,31 @@ bool ARMDAGToDAGISel::tryARMIndexedLoad(SDNode *N) {
   return false;
 }
 
+bool ARMDAGToDAGISel::tryT1IndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  // Only a non-extending, post-incremented i32 load maps onto LDM rN!, {rT}.
+  // Comparing the EVT directly avoids asserting on non-simple memory types.
+  if (LD->getAddressingMode() != ISD::POST_INC ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD ||
+      LD->getMemoryVT() != MVT::i32)
+    return false;
+
+  auto *COffs = dyn_cast<ConstantSDNode>(LD->getOffset());
+  if (!COffs || COffs->getZExtValue() != 4)
+    return false;
+
+  // A T1 post-indexed load is just a single register LDM: LDM r0!, {r1}.
+  // The encoding of LDM is not how the rest of ISel expects a post-inc load to
+  // look however, so we use a pseudo here and switch it for a tLDMIA_UPD after
+  // ISel.
+  SDLoc dl(N);
+  SDValue Ops[] = { LD->getBasePtr(), getAL(CurDAG, dl),
+                    CurDAG->getRegister(0, MVT::i32), LD->getChain() };
+  ReplaceNode(N, CurDAG->getMachineNode(ARM::tLDR_postidx, dl, MVT::i32,
+                                        MVT::i32, MVT::Other, Ops));
+  return true;
+}
+
 bool ARMDAGToDAGISel::tryT2IndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::MemIndexedMode AM = LD->getAddressingMode();
@@ -3015,6 +3041,9 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     if (Subtarget->isThumb() && Subtarget->hasThumb2()) {
       if (tryT2IndexedLoad(N))
         return;
+    } else if (Subtarget->isThumb()) {
+      if (tryT1IndexedLoad(N))
+        return;
     } else if (tryARMIndexedLoad(N))
       return;
     // Other cases are autogenerated.
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 12ed20c2ea4..1d07e773f9d 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -715,6 +715,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
       setIndexedStoreAction(im, MVT::i16, Legal);
       setIndexedStoreAction(im, MVT::i32, Legal);
     }
+  } else {
+    // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
+    setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
+    setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
   }
 
   setOperationAction(ISD::SADDO, MVT::i32, Custom);
@@ -8247,6 +8251,19 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.dump();
     llvm_unreachable("Unexpected instr type to insert");
   }
+
+  // Thumb1 post-indexed loads are really just single-register LDMs.
+  case ARM::tLDR_postidx: {
+    BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
+      .addOperand(MI.getOperand(1)) // Rn_wb (written-back base, def)
+      .addOperand(MI.getOperand(2)) // Rn (incoming base)
+      .addOperand(MI.getOperand(3)) // PredImm
+      .addOperand(MI.getOperand(4)) // PredReg
+      .addOperand(MI.getOperand(0)); // Rt (the one-entry register list)
+    MI.eraseFromParent(); // MI is a reference here, so '.' rather than '->'.
+    return BB;
+  }
+
   // The Thumb2 pre-indexed stores have the same MI operands, they just
   // define them differently in the .td files from the isel patterns, so
   // they need pseudos.
@@ -11596,22 +11613,37 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                    SDValue &Offset,
                                                    ISD::MemIndexedMode &AM,
                                                    SelectionDAG &DAG) const {
-  if (Subtarget->isThumb1Only())
-    return false;
-
   EVT VT;
   SDValue Ptr;
-  bool isSEXTLoad = false;
+  bool isSEXTLoad = false, isNonExt;
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
     VT  = LD->getMemoryVT();
     Ptr = LD->getBasePtr();
     isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+    isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
   } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
     VT  = ST->getMemoryVT();
     Ptr = ST->getBasePtr();
+    isNonExt = !ST->isTruncatingStore();
   } else
     return false;
 
+  if (Subtarget->isThumb1Only()) {
+    // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
+    // must be non-extending/truncating, i32, with an offset of 4.
+    assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
+    if (Op->getOpcode() != ISD::ADD || !isNonExt)
+      return false;
+    auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!RHS || RHS->getZExtValue() != 4)
+      return false;
+    
+    Offset = Op->getOperand(1);
+    Base = Op->getOperand(0);
+    AM = ISD::POST_INC;
+    return true;
+  }
+  
   bool isInc;
   bool isLegal = false;
   if (Subtarget->isThumb2())
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index bec1ea8763d..93a174f3678 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1451,6 +1451,24 @@ def : T1Pat<(extloadi8  t_addrmode_rr:$addr),  (tLDRBr t_addrmode_rr:$addr)>;
 def : T1Pat<(extloadi16 t_addrmode_is2:$addr), (tLDRHi t_addrmode_is2:$addr)>;
 def : T1Pat<(extloadi16 t_addrmode_rr:$addr),  (tLDRHr t_addrmode_rr:$addr)>;
 
+// post-inc loads and stores
+
+// post-inc LDR -> LDM r0!, {r1}. The way operands are laid out in LDMs is
+// different to how ISel expects them for a post-inc load, so use a pseudo
+// and expand it just after ISel. Thumb-1 LDM only encodes low regs (tGPR).
+let usesCustomInserter = 1, mayLoad = 1,
+    Constraints = "$Rn = $Rn_wb,@earlyclobber $Rn_wb" in
+ def tLDR_postidx: tPseudoInst<(outs tGPR:$Rt, tGPR:$Rn_wb),
+                               (ins tGPR:$Rn, pred:$p),
+                               4, IIC_iLoad_iu,
+                               []>;
+
+// post-inc STR -> STM r0!, {r1}. The layout of this (because it doesn't def
+// multiple registers) is the same in ISel as MachineInstr, so there's no need
+// for a pseudo. Thumb-1 STM only encodes low registers, hence tGPR.
+def : T1Pat<(post_store tGPR:$Rt, tGPR:$Rn, 4),
+            (tSTMIA_UPD tGPR:$Rn, tGPR:$Rt)>;
+
 // If it's impossible to use [r,r] address mode for sextload, select to
 // ldr{b|h} + sxt{b|h} instead.
 def : T1Pat<(sextloadi8 t_addrmode_is1:$addr),
diff --git a/test/CodeGen/Thumb/ldm-stm-postinc.ll b/test/CodeGen/Thumb/ldm-stm-postinc.ll
new file mode 100644
index 00000000000..f2e222bd5b9
--- /dev/null
+++ b/test/CodeGen/Thumb/ldm-stm-postinc.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mtriple=thumbv7 -mcpu=cortex-m0 < %s -disable-lsr | FileCheck %s
+; FIXME: LSR mangles the last two testcases pretty badly. When this is fixed, remove
+; the -disable-lsr above.
+
+; CHECK-LABEL: @f
+; CHECK: ldm {{r[0-9]}}!, {r{{[0-9]}}}
+define i32 @f(i32* readonly %a, i32* readnone %b) {
+  %1 = icmp eq i32* %a, %b
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %i.02 = phi i32 [ %3, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi i32* [ %4, %.lr.ph ], [ %a, %0 ]
+  %2 = load i32, i32* %.01, align 4
+  %3 = add nsw i32 %2, %i.02
+  %4 = getelementptr inbounds i32, i32* %.01, i32 1
+  %5 = icmp eq i32* %4, %b
+  br i1 %5, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %i.0.lcssa = phi i32 [ 0, %0 ], [ %3, %.lr.ph ]
+  ret i32 %i.0.lcssa
+}
+
+; CHECK-LABEL: @g
+; CHECK-NOT: ldm
+define i32 @g(i32* readonly %a, i32* readnone %b) {
+  %1 = icmp eq i32* %a, %b
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %i.02 = phi i32 [ %3, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi i32* [ %4, %.lr.ph ], [ %a, %0 ]
+  %2 = load i32, i32* %.01, align 4
+  %3 = add nsw i32 %2, %i.02
+  %4 = getelementptr inbounds i32, i32* %.01, i32 2
+  %5 = icmp eq i32* %4, %b
+  br i1 %5, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %i.0.lcssa = phi i32 [ 0, %0 ], [ %3, %.lr.ph ]
+  ret i32 %i.0.lcssa
+}
+
+; CHECK-LABEL: @h
+; CHECK: stm {{r[0-9]}}!, {r{{[0-9]}}}
+define void @h(i32* %a, i32* readnone %b) {
+  %1 = icmp eq i32* %a, %b
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %i.02 = phi i32 [ %2, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi i32* [ %3, %.lr.ph ], [ %a, %0 ]
+  %2 = add nsw i32 %i.02, 1
+  store i32 %i.02, i32* %.01, align 4
+  %3 = getelementptr inbounds i32, i32* %.01, i32 1
+  %4 = icmp eq i32* %3, %b
+  br i1 %4, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}
+
+; CHECK-LABEL: @j
+; CHECK-NOT: stm
+define void @j(i32* %a, i32* readnone %b) {
+  %1 = icmp eq i32* %a, %b
+  br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph, %0
+  %i.02 = phi i32 [ %2, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi i32* [ %3, %.lr.ph ], [ %a, %0 ]
+  %2 = add nsw i32 %i.02, 1
+  store i32 %i.02, i32* %.01, align 4
+  %3 = getelementptr inbounds i32, i32* %.01, i32 2
+  %4 = icmp eq i32* %3, %b
+  br i1 %4, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret void
+}