From 0ebd182c7a77a9abf4179ad7c356d47f9db3e237 Mon Sep 17 00:00:00 2001
From: Hal Finkel <hfinkel@anl.gov>
Date: Tue, 1 Apr 2014 18:50:34 +0000
Subject: [PATCH] Implement X86TTI::getUnrollingPreferences

This provides an initial implementation of getUnrollingPreferences for x86.
getUnrollingPreferences is used by the generic (concatenation) unroller, which
is distinct from the unrolling done by the loop vectorizer. Many modern x86
cores have some kind of uop cache and loop-stream detector (LSD) used to
efficiently dispatch small loops, and taking full advantage of this requires
unrolling small loops (small here means 10s of uops).

These caches also have limits on the number of taken branches in the loop, and
so we also cap the loop unrolling factor based on the maximum "depth" of the
loop. This is currently calculated with a partial DFS traversal (partial
because it will stop early if the path length grows too much). This is still an
approximation, and one that is both conservative (because it does not account
for branches eliminated via block placement) and optimistic (because it is only
recording the maximum depth over minimum paths). Nevertheless, because the
loops that fit in these uop caches are so small, it is not clear how much the
details matter.

The original set of patches posted for review produced the following test-suite
performance results (from the TSVC benchmark) at that time:
  ControlLoops-dbl - 13% speedup
  ControlLoops-flt - 15% speedup
  Reductions-dbl - 7.5% speedup

llvm-svn: 205348
---
 lib/Target/X86/X86TargetTransformInfo.cpp     | 103 ++++++++++++++++++
 test/Transforms/LoopUnroll/X86/lit.local.cfg  |   4 +
 test/Transforms/LoopUnroll/X86/partial.ll     |  80 ++++++++++++++
 .../LoopVectorize/X86/metadata-enable.ll      |  20 ++--
 4 files changed, 197 insertions(+), 10 deletions(-)
 create mode 100644 test/Transforms/LoopUnroll/X86/lit.local.cfg
 create mode 100644 test/Transforms/LoopUnroll/X86/partial.ll
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index ed04cdc4e40..437f63d3280 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -17,8 +17,11 @@
 #define DEBUG_TYPE "x86tti"
 #include "X86.h"
 #include "X86TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -31,6 +34,17 @@ namespace llvm {
 void initializeX86TTIPass(PassRegistry &);
 }
 
+static cl::opt<bool> // Off => getUnrollingPreferences leaves UP untouched.
+UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
+  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
+static cl::opt<unsigned> // If given, replaces the per-CPU uop limit (MaxOps).
+PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
+static cl::opt<unsigned> // Branch limit; used only if the threshold is given.
+PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
+  cl::desc("Threshold for taken branches in X86 partial unrolling"),
+  cl::Hidden);
+
 namespace {
 
 class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -73,6 +87,8 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+  void getUnrollingPreferences(Loop *L,
+                               UnrollingPreferences &UP) const override;
 
   /// @}
 
@@ -137,6 +153,93 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
+void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+  if (!UsePartialUnrolling)
+    return; // Disabled via -x86-use-partial-unrolling: leave UP unchanged.
+  // Request partial and runtime unrolling for small loops on cores that have
+  // a loop buffer. According to the Intel 64 and IA-32 Architectures
+  // Optimization Reference Manual, Intel Core models and later have a loop
+  // stream detector (and associated uop queue). The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+  //    taken, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h Processors,
+  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+  // buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches.
+  //  - The loop must have fewer than 40 uops in all executed loop branches.
+
+  unsigned MaxBranches, MaxOps; // Per-target taken-branch and uop limits.
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
+    MaxBranches = PartialUnrollingMaxBranches; // Consulted only on this path.
+    MaxOps = PartialUnrollingThreshold; // Explicit flag beats CPU detection.
+  } else if (ST->isAtom()) {
+    // On the Atom, the throughput for taken branches is 2 cycles. For small
+    // simple loops, expand by a small factor to hide the backedge cost.
+    MaxBranches = 2;
+    MaxOps = 10;
+  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
+    MaxBranches = 16; // NOTE(review): comment says "fewer than 16"; confirm.
+    MaxOps = 40; // NOTE(review): comment says "fewer than 40"; confirm.
+  } else if (ST->hasFMA4() /* Any other recent AMD */) {
+    return; // No limits chosen for these cores: leave UP unchanged.
+  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
+    MaxBranches = 8;
+    MaxOps = 28;
+  } else if (ST->hasSSSE3() /* Intel Core */) {
+    MaxBranches = 4;
+    MaxOps = 18;
+  } else {
+    return; // No matching core family: leave UP unchanged.
+  }
+
+  // Scan the loop: don't unroll loops with calls, and count the potential
+  // number of taken branches (this is somewhat conservative because we're
+  // counting all block transitions as potential branches while in reality some
+  // of these will become implicit via block placement).
+  unsigned MaxDepth = 0; // Longest DFS path length seen so far.
+  for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
+       DE = df_end(L->getHeader()); DI != DE;) { // DI advanced in body.
+    if (!L->contains(*DI)) {
+      DI.skipChildren(); // Outside the loop: don't visit its successors.
+      continue;
+    }
+
+    MaxDepth = std::max(MaxDepth, DI.getPathLength());
+    if (MaxDepth > MaxBranches)
+      return; // Too many potential taken branches: leave UP unchanged.
+
+    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
+      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+        ImmutableCallSite CS(I);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!isLoweredToCall(F))
+            continue; // Not lowered to a real call: keep scanning.
+        }
+
+        return; // Lowered or unknown-callee call: leave UP unchanged.
+      }
+
+    ++DI;
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+
+  // Set the maximum count based on the loop depth. The maximum number of
+  // branches taken in a loop (including the backedge) is equal to the maximum
+  // loop depth (the DFS path length from the loop header to any block in the
+  // loop). When the loop is unrolled, this depth (except for the backedge
+  // itself) is multiplied by the unrolling factor. This new unrolled depth
+  // must be less than the target-specific maximum branch count (which limits
+  // the number of taken branches in the uop buffer).
+  if (MaxDepth > 1) // Guards the division below against a zero divisor.
+    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
+}
+
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasSSE1())
     return 0;
diff --git a/test/Transforms/LoopUnroll/X86/lit.local.cfg b/test/Transforms/LoopUnroll/X86/lit.local.cfg
new file mode 100644
index 00000000000..ba763cf03ff
--- /dev/null
+++ b/test/Transforms/LoopUnroll/X86/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopUnroll/X86/partial.ll b/test/Transforms/LoopUnroll/X86/partial.ll
new file mode 100644
index 00000000000..15867cbea0a
--- /dev/null
+++ b/test/Transforms/LoopUnroll/X86/partial.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -S -loop-unroll -mcpu=nehalem -x86-use-partial-unrolling=1 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mcpu=core -x86-use-partial-unrolling=1 | FileCheck -check-prefix=CHECK-NOUNRL %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds double* %b, i64 %index
+  %1 = bitcast double* %0 to <2 x double>*
+  %wide.load = load <2 x double>* %1, align 8
+  %.sum9 = or i64 %index, 2
+  %2 = getelementptr double* %b, i64 %.sum9
+  %3 = bitcast double* %2 to <2 x double>*
+  %wide.load8 = load <2 x double>* %3, align 8
+  %4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+  %5 = fadd <2 x double> %wide.load8, <double 1.000000e+00, double 1.000000e+00>
+  %6 = getelementptr inbounds double* %a, i64 %index
+  %7 = bitcast double* %6 to <2 x double>*
+  store <2 x double> %4, <2 x double>* %7, align 8
+  %.sum10 = or i64 %index, 2
+  %8 = getelementptr double* %a, i64 %.sum10
+  %9 = bitcast double* %8 to <2 x double>*
+  store <2 x double> %5, <2 x double>* %9, align 8
+  %index.next = add i64 %index, 4
+  %10 = icmp eq i64 %index.next, 1600
+  br i1 %10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model needs to be fixed to account for instructions likely to be folded
+; as part of an addressing mode.
+; CHECK-LABEL: @foo
+; CHECK-NOUNRL-LABEL: @foo
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+define void @bar(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %v0 = getelementptr inbounds double* %b, i64 %index
+  %v1 = bitcast double* %v0 to <2 x double>*
+  %wide.load = load <2 x double>* %v1, align 8
+  %v4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+  %v5 = fmul <2 x double> %v4, <double 8.000000e+00, double 8.000000e+00>
+  %v6 = getelementptr inbounds double* %a, i64 %index
+  %v7 = bitcast double* %v6 to <2 x double>*
+  store <2 x double> %v5, <2 x double>* %v7, align 8
+  %index.next = add i64 %index, 2
+  %v10 = icmp eq i64 %index.next, 1600
+  br i1 %v10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model needs to be fixed to account for instructions likely to be folded
+; as part of an addressing mode.
+
+; CHECK-LABEL: @bar
+; CHECK: fadd
+; CHECK-NEXT: fmul
+; CHECK: fadd
+; CHECK-NEXT: fmul
+
+; CHECK-NOUNRL-LABEL: @bar
+; CHECK-NOUNRL: fadd
+; CHECK-NOUNRL-NEXT: fmul
+; CHECK-NOUNRL-NOT: fadd
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index e98a4acddea..224823b8ed5 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1,13 +1,13 @@
-; RUN: opt < %s -mcpu=corei7 -O1 -S | FileCheck %s --check-prefix=O1
-; RUN: opt < %s -mcpu=corei7 -O2 -S | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S | FileCheck %s --check-prefix=O3
-; RUN: opt < %s -mcpu=corei7 -Os -S | FileCheck %s --check-prefix=Os
-; RUN: opt < %s -mcpu=corei7 -Oz -S | FileCheck %s --check-prefix=Oz
-; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S | FileCheck %s --check-prefix=O1VEC
-; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S | FileCheck %s --check-prefix=OzVEC
-; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S | FileCheck %s --check-prefix=O1VEC2
-; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O1 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -Os -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3DIS
 
 ; This file tests the llvm.vectorizer.pragma forcing vectorization even when
 ; optimization levels are too low, or when vectorization is disabled.