[ARM] Improve if-conversion for M-class CPUs without branch predictors

The current heuristic in isProfitableToIfCvt assumes we have a branch predictor, and so gives the wrong answer in some cases when we don't. This patch adds a subtarget feature to indicate that a subtarget has no branch predictor, and changes the heuristic in isProfitableToIfCvt when it's present. This gives a slight overall improvement in a set of embedded benchmarks on Cortex-M4 and Cortex-M33. Differential Revision: https://reviews.llvm.org/D34398 llvm-svn: 306547
2024-11-23 03:02:36 +01:00 · 2017-06-28 14:11:15 +00:00 · 2017-06-28 14:11:15 +00:00 · 3ad0317ae0
commit 3ad0317ae0
parent ed751c8384
6 changed files with 240 additions and 15 deletions
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@ -222,6 +222,13 @@ def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop",
 def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true",
                                     "Has return address stack">;

+// Some processors have no branch predictor, which changes the expected cost of
+// taking a branch, which in turn affects the choice of whether to use
+// predicated instructions.
+def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor",
+                                                   "HasBranchPredictor", "false",
+                                                   "Has no branch predictor">;
+
 /// DSP extension.
 def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true",
                              "Supports DSP instructions in ARM and/or Thumb2">;
@ -756,13 +763,19 @@ def : ProcessorModel<"cortex-r8",   CortexA8Model,      [ARMv7r,
                                                         FeatureHasSlowFPVMLx,
                                                         FeatureAvoidPartialCPSR]>;

-def : ProcNoItin<"cortex-m3",                           [ARMv7m, ProcM3]>;
-def : ProcNoItin<"sc300",                               [ARMv7m, ProcM3]>;
+def : ProcessorModel<"cortex-m3", CortexM3Model,        [ARMv7m,
+                                                         ProcM3,
+                                                         FeatureHasNoBranchPredictor]>;

-def : ProcNoItin<"cortex-m4",                           [ARMv7em,
+def : ProcessorModel<"sc300",     CortexM3Model,        [ARMv7m,
+                                                         ProcM3,
+                                                         FeatureHasNoBranchPredictor]>;
+
+def : ProcessorModel<"cortex-m4", CortexM3Model,        [ARMv7em,
                                                         FeatureVFP4,
                                                         FeatureVFPOnlySP,
-                                                         FeatureD16]>;
+                                                         FeatureD16,
+                                                         FeatureHasNoBranchPredictor]>;

 def : ProcNoItin<"cortex-m7",                           [ARMv7em,
                                                         FeatureFPARMv8,
@ -771,11 +784,12 @@ def : ProcNoItin<"cortex-m7",                           [ARMv7em,
 def : ProcNoItin<"cortex-m23",                          [ARMv8mBaseline,
                                                         FeatureNoMovt]>;

-def : ProcNoItin<"cortex-m33",                          [ARMv8mMainline,
+def : ProcessorModel<"cortex-m33", CortexM3Model,       [ARMv8mMainline,
                                                         FeatureDSP,
                                                         FeatureFPARMv8,
                                                         FeatureD16,
-                                                         FeatureVFPOnlySP]>;
+                                                         FeatureVFPOnlySP,
+                                                         FeatureHasNoBranchPredictor]>;

 def : ProcNoItin<"cortex-a32",                           [ARMv8a,
                                                         FeatureHWDivThumb,
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@ -1851,9 +1851,9 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
 }

 bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &,
+isProfitableToIfCvt(MachineBasicBlock &TBB,
                    unsigned TCycles, unsigned TExtra,
-                    MachineBasicBlock &,
+                    MachineBasicBlock &FBB,
                    unsigned FCycles, unsigned FExtra,
                    BranchProbability Probability) const {
  if (!TCycles)
@ -1863,14 +1863,43 @@ isProfitableToIfCvt(MachineBasicBlock &,
  // Here we scale up each component of UnpredCost to avoid precision issue when
  // scaling TCycles/FCycles by Probability.
  const unsigned ScalingUpFactor = 1024;
-  unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
-  unsigned FUnpredCost =
-      Probability.getCompl().scale(FCycles * ScalingUpFactor);
-  unsigned UnpredCost = TUnpredCost + FUnpredCost;
-  UnpredCost += 1 * ScalingUpFactor; // The branch itself
-  UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;

-  return (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor <= UnpredCost;
+  unsigned PredCost = (TCycles + FCycles + TExtra + FExtra) * ScalingUpFactor;
+  unsigned UnpredCost;
+  if (!Subtarget.hasBranchPredictor()) {
+    // When we don't have a branch predictor it's always cheaper to not take a
+    // branch than take it, so we have to take that into account.
+    unsigned NotTakenBranchCost = 1;
+    unsigned TakenBranchCost = Subtarget.getMispredictionPenalty();
+    unsigned TUnpredCycles, FUnpredCycles;
+    if (!FCycles) {
+      // Triangle: TBB is the fallthrough
+      TUnpredCycles = TCycles + NotTakenBranchCost;
+      FUnpredCycles = TakenBranchCost;
+    } else {
+      // Diamond: TBB is the block that is branched to, FBB is the fallthrough
+      TUnpredCycles = TCycles + TakenBranchCost;
+      FUnpredCycles = FCycles + NotTakenBranchCost;
+    }
+    // The total cost is the cost of each path scaled by their probabilities
+    unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor);
+    unsigned FUnpredCost = Probability.getCompl().scale(FUnpredCycles * ScalingUpFactor);
+    UnpredCost = TUnpredCost + FUnpredCost;
+    // When predicating assume that the first IT can be folded away but later
+    // ones cost one cycle each
+    if (Subtarget.isThumb2() && TCycles + FCycles > 4) {
+      PredCost += ((TCycles + FCycles - 4) / 4) * ScalingUpFactor;
+    }
+  } else {
+    unsigned TUnpredCost = Probability.scale(TCycles * ScalingUpFactor);
+    unsigned FUnpredCost =
+      Probability.getCompl().scale(FCycles * ScalingUpFactor);
+    UnpredCost = TUnpredCost + FUnpredCost;
+    UnpredCost += 1 * ScalingUpFactor; // The branch itself
+    UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
+  }
+
+  return PredCost <= UnpredCost;
 }

 bool
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@ -424,3 +424,4 @@ include "ARMScheduleA9.td"
 include "ARMScheduleSwift.td"
 include "ARMScheduleR52.td"
 include "ARMScheduleA57.td"
+include "ARMScheduleM3.td"
--- a/lib/Target/ARM/ARMScheduleM3.td
+++ b/lib/Target/ARM/ARMScheduleM3.td
@ -0,0 +1,21 @@
+//=- ARMScheduleM3.td - ARM Cortex-M3 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for the ARM Cortex-M3 processor.
+//
+//===----------------------------------------------------------------------===//
+
+def CortexM3Model : SchedMachineModel {
+  let IssueWidth        = 1; // Only IT can be dual-issued, so assume single-issue
+  let MicroOpBufferSize = 0; // In-order
+  let LoadLatency       = 2; // Latency when not pipelined, not pc-relative
+  let MispredictPenalty = 2; // Best-case cost of a taken branch
+
+  let CompleteModel = 0; // No per-instruction scheduling data is defined here
+}
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@ -246,6 +246,11 @@ protected:
  /// avoid issue "normal" call instructions to callees which do not return.
  bool HasRetAddrStack = false;

+  /// HasBranchPredictor - True if the subtarget has a branch predictor. Having
+  /// a branch predictor or not changes the expected cost of taking a branch,
+  /// which affects the choice of whether to use predicated instructions.
+  bool HasBranchPredictor = true;
+
  /// HasMPExtension - True if the subtarget supports Multiprocessing
  /// extension (ARMv7 only).
  bool HasMPExtension = false;
@ -554,6 +559,7 @@ public:
  bool cheapPredicableCPSRDef() const { return CheapPredicableCPSRDef; }
  bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
  bool hasRetAddrStack() const { return HasRetAddrStack; }
+  bool hasBranchPredictor() const { return HasBranchPredictor; }
  bool hasMPExtension() const { return HasMPExtension; }
  bool hasDSP() const { return HasDSP; }
  bool useNaClTrap() const { return UseNaClTrap; }
--- a/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
+++ b/test/CodeGen/Thumb2/ifcvt-no-branch-predictor.ll
@ -0,0 +1,154 @@
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m7 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BP
+; RUN: llc < %s -mtriple=thumbv7m -mcpu=cortex-m3 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-NOBP
+
+declare void @otherfn()
+
+; CHECK-LABEL: triangle1:
+; CHECK: itt ne
+; CHECK: movne
+; CHECK: strne
+define i32 @triangle1(i32 %n, i32* %p) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  br label %if.end
+
+if.end:
+  tail call void @otherfn()
+  ret i32 0
+}
+
+; CHECK-LABEL: triangle2:
+; CHECK-BP: itttt ne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-BP: movne
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+; CHECK-NOBP: movs
+; CHECK-NOBP: str
+define i32 @triangle2(i32 %n, i32* %p, i32* %q) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  store i32 2, i32* %q, align 4
+  br label %if.end
+
+if.end:
+  tail call void @otherfn()
+  ret i32 0
+}
+
+; CHECK-LABEL: triangle3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+; CHECK: movs
+; CHECK: str
+define i32 @triangle3(i32 %n, i32* %p, i32* %q, i32* %r) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  store i32 2, i32* %q, align 4
+  store i32 3, i32* %r, align 4
+  br label %if.end
+
+if.end:
+  tail call void @otherfn()
+  ret i32 0
+}
+
+; CHECK-LABEL: diamond1:
+; CHECK: ite eq
+; CHECK: ldreq
+; CHECK: strne
+define i32 @diamond1(i32 %n, i32* %p) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  store i32 %n, i32* %p, align 4
+  br label %if.end
+
+if.else:
+  %0 = load i32, i32* %p, align 4
+  br label %if.end
+
+if.end:
+  %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+  tail call void @otherfn()
+  ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond2:
+; CHECK-BP: itte
+; CHECK-BP: streq
+; CHECK-BP: ldreq
+; CHECK-BP: strne
+; CHECK-NOBP: cbz
+; CHECK-NOBP: str
+; CHECK-NOBP: b
+; CHECK-NOBP: str
+; CHECK-NOBP: ldr
+define i32 @diamond2(i32 %n, i32 %m, i32* %p, i32* %q) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  store i32 %n, i32* %p, align 4
+  br label %if.end
+
+if.else:
+  store i32 %m, i32* %q, align 4
+  %0 = load i32, i32* %p, align 4
+  br label %if.end
+
+if.end:
+  %n.addr.0 = phi i32 [ %n, %if.then ], [ %0, %if.else ]
+  tail call void @otherfn()
+  ret i32 %n.addr.0
+}
+
+; CHECK-LABEL: diamond3:
+; CHECK: cbz
+; CHECK: movs
+; CHECK: str
+; CHECK: b
+; CHECK: ldr
+; CHECK: ldr
+; CHECK: adds
+define i32 @diamond3(i32 %n, i32* %p, i32* %q) {
+entry:
+  %tobool = icmp eq i32 %n, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  store i32 1, i32* %p, align 4
+  br label %if.end
+
+if.else:
+  %0 = load i32, i32* %p, align 4
+  %1 = load i32, i32* %q, align 4
+  %add = add nsw i32 %1, %0
+  br label %if.end
+
+if.end:
+  %n.addr.0 = phi i32 [ %n, %if.then ], [ %add, %if.else ]
+  tail call void @otherfn()
+  ret i32 %n.addr.0
+}