1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[LoopIdiomRecognize] Recommit: BCmp loop idiom recognition

Summary:
This is a recommit, this originally landed in rL370454 but was
subsequently reverted in  rL370788 due to
https://bugs.llvm.org/show_bug.cgi?id=43206
The reduced testcase was added to bcmp-negative-tests.ll
as @pr43206_different_loops - we must ensure that the SCEV's
we got are both for the same loop we are currently investigating.

Original commit message:

@mclow.lists brought this issue up in IRC.
It is a reasonably common problem to compare some two values for equality.
Those may be just some integers, strings or arrays of integers.

In C, there is `memcmp()`, `bcmp()` functions.
In C++, there exists `std::equal()` algorithm.
One can also write that function manually.

libstdc++'s `std::equal()` is specialized to directly call `memcmp()` for
various types, but not `std::byte` from C++2a. https://godbolt.org/z/mx2ejJ

libc++ does not do anything like that, it simply relies on simple C++'s
`operator==()`. https://godbolt.org/z/er0Zwf (GOOD!)

So there likely exist certain performance opportunities.
Let's compare performance of naive `std::equal()` (no `memcmp()`) with one that
is using `memcmp()` (in this case, compiled with modified compiler). {F8768213}

```
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <limits>
#include <random>
#include <type_traits>
#include <utility>
#include <vector>

#include "benchmark/benchmark.h"

// Naive element-wise comparison of the range [a, a_end) against the range that
// starts at b. Stops and returns false at the first mismatching pair; returns
// true when no mismatch is found (an empty range therefore compares equal).
template <class T>
bool equal(T* a, T* a_end, T* b) noexcept {
  while (a != a_end) {
    if (*a != *b) return false;
    ++a;
    ++b;
  }
  return true;
}

// Returns a vector of `count` random values of type T, drawn uniformly from
// the full range [numeric_limits<T>::min(), numeric_limits<T>::max()] by a
// Mersenne Twister seeded from std::random_device.
template <typename T>
std::vector<T> getVectorOfRandomNumbers(size_t count) {
  // std::uniform_int_distribution is only specified for short, int, long,
  // long long and their unsigned counterparts. Instantiating it with a
  // character-sized type such as uint8_t is undefined, and some standard
  // libraries reject it outright. Draw from a short-sized type of matching
  // signedness instead and narrow the result; the bounds still come from T.
  using DistT = typename std::conditional<
      (sizeof(T) < sizeof(short)),
      typename std::conditional<std::is_signed<T>::value, short,
                                unsigned short>::type,
      T>::type;
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_int_distribution<DistT> dis(std::numeric_limits<T>::min(),
                                           std::numeric_limits<T>::max());
  std::vector<T> v;
  v.reserve(count);
  std::generate_n(std::back_inserter(v), count,
                  [&dis, &gen]() { return static_cast<T>(dis(gen)); });
  assert(v.size() == count);
  return v;
}

// Input generator: yields a pair of vectors of `count` random elements whose
// contents are identical, so a comparison has to scan every element.
struct Identical {
  template <typename T>
  static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) {
    auto First = getVectorOfRandomNumbers<T>(count);
    auto Second = First;  // independent copy holding the same values.
    return std::make_pair(std::move(First), std::move(Second));
  }
};

// Input generator: yields a pair of vectors of `count` random elements that
// are equal everywhere except at the middle index, so a comparison stops about
// halfway through.
struct InequalHalfway {
  template <typename T>
  static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) {
    auto V0 = getVectorOfRandomNumbers<T>(count);
    auto V1 = V0;
    // An empty vector has no middle element; indexing it would be out of
    // bounds, so the (trivially equal) pair is returned unchanged.
    if (!V1.empty())
      V1[V1.size() / size_t(2)]++;  // just change the value.
    return std::make_pair(std::move(V0), std::move(V1));
  }
};

// Benchmark body. Builds two buffers of state.range(0) elements of type T with
// the generator `Gen`, then times equal() over them. Reports element and byte
// throughput counters alongside the timing.
template <class T, class Gen>
void BM_bcmp(benchmark::State& state) {
  using benchmark::Counter;

  const size_t NumElts = state.range(0);

  const std::pair<std::vector<T>, std::vector<T>> Buffers =
      Gen::template Gen<T>(NumElts);
  const std::vector<T>& lhs = Buffers.first;
  const std::vector<T>& rhs = Buffers.second;
  assert(lhs.size() == NumElts && rhs.size() == lhs.size());

  // Keep the optimizer from reasoning about the buffers' contents.
  benchmark::ClobberMemory();
  benchmark::DoNotOptimize(lhs);
  benchmark::DoNotOptimize(lhs.data());
  benchmark::DoNotOptimize(rhs);
  benchmark::DoNotOptimize(rhs.data());

  for (auto _ : state) {
    const bool is_equal = equal(lhs.data(), lhs.data() + lhs.size(), rhs.data());
    benchmark::DoNotOptimize(is_equal);
  }

  state.SetComplexityN(NumElts);

  // Element counters.
  state.counters["eltcnt"] = Counter(NumElts, Counter::kIsIterationInvariant);
  state.counters["eltcnt/sec"] =
      Counter(NumElts, Counter::kIsIterationInvariantRate);

  // Byte counters: both buffers are counted, hence the factor of two.
  const size_t TotalBytes = 2 * sizeof(T) * NumElts;
  state.counters["bytes_read/iteration"] =
      Counter(TotalBytes, Counter::kDefaults, Counter::OneK::kIs1024);
  state.counters["bytes_read/sec"] =
      Counter(TotalBytes, Counter::kIsIterationInvariantRate,
              Counter::OneK::kIs1024);
}

// Configures benchmark `b` to sweep element counts from 1 up to the largest
// count for which both buffers of T fit in the level-2 cache (with a 2x safety
// margin), doubling at each step, and requests O(N) complexity reporting.
template <typename T>
static void CustomArguments(benchmark::internal::Benchmark* b) {
  // Size of the first level-2 cache reported for this CPU, or 0 if none is.
  const size_t L2SizeBytes = []() {
    for (const benchmark::CPUInfo::CacheInfo& I :
         benchmark::CPUInfo::Get().caches) {
      if (I.level == 2) return I.size;
    }
    return 0;
  }();
  // What is the largest range we can check to always fit within given L2 cache?
  // Clamp to at least one element: when no L2 cache is reported (or it is
  // smaller than one element's budget) the quotient is 0, and Range(1, 0)
  // would be an inverted range.
  const size_t MaxLen =
      std::max<size_t>(1, L2SizeBytes / /*total bufs*/ 2 /
                              /*maximal elt size*/ sizeof(T) /
                              /*safety margin*/ 2);
  b->RangeMultiplier(2)->Range(1, MaxLen)->Complexity(benchmark::oN);
}

// Register BM_bcmp for every element width with buffers that are fully equal.
// Each registration takes its argument range from CustomArguments<T>.
BENCHMARK_TEMPLATE(BM_bcmp, uint8_t, Identical)
    ->Apply(CustomArguments<uint8_t>);
BENCHMARK_TEMPLATE(BM_bcmp, uint16_t, Identical)
    ->Apply(CustomArguments<uint16_t>);
BENCHMARK_TEMPLATE(BM_bcmp, uint32_t, Identical)
    ->Apply(CustomArguments<uint32_t>);
BENCHMARK_TEMPLATE(BM_bcmp, uint64_t, Identical)
    ->Apply(CustomArguments<uint64_t>);

// Same element widths, but with buffers that differ at the middle element.
BENCHMARK_TEMPLATE(BM_bcmp, uint8_t, InequalHalfway)
    ->Apply(CustomArguments<uint8_t>);
BENCHMARK_TEMPLATE(BM_bcmp, uint16_t, InequalHalfway)
    ->Apply(CustomArguments<uint16_t>);
BENCHMARK_TEMPLATE(BM_bcmp, uint32_t, InequalHalfway)
    ->Apply(CustomArguments<uint32_t>);
BENCHMARK_TEMPLATE(BM_bcmp, uint64_t, InequalHalfway)
    ->Apply(CustomArguments<uint64_t>);
```
{F8768210}
```
$ ~/src/googlebenchmark/tools/compare.py --no-utest benchmarks build-{old,new}/test/llvm-bcmp-bench
RUNNING: build-old/test/llvm-bcmp-bench --benchmark_out=/tmp/tmpb6PEUx
2019-04-25 21:17:11
Running build-old/test/llvm-bcmp-bench
Run on (8 X 4000 MHz CPU s)
CPU Caches:
  L1 Data 16K (x8)
  L1 Instruction 64K (x4)
  L2 Unified 2048K (x4)
  L3 Unified 8192K (x1)
Load Average: 0.65, 3.90, 4.14
---------------------------------------------------------------------------------------------------
Benchmark                                         Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------------------------------------
<...>
BM_bcmp<uint8_t, Identical>/512000           432131 ns       432101 ns         1613 bytes_read/iteration=1000k bytes_read/sec=2.20706G/s eltcnt=825.856M eltcnt/sec=1.18491G/s
BM_bcmp<uint8_t, Identical>_BigO               0.86 N          0.86 N
BM_bcmp<uint8_t, Identical>_RMS                   8 %             8 %
<...>
BM_bcmp<uint16_t, Identical>/256000          161408 ns       161409 ns         4027 bytes_read/iteration=1000k bytes_read/sec=5.90843G/s eltcnt=1030.91M eltcnt/sec=1.58603G/s
BM_bcmp<uint16_t, Identical>_BigO              0.67 N          0.67 N
BM_bcmp<uint16_t, Identical>_RMS                 25 %            25 %
<...>
BM_bcmp<uint32_t, Identical>/128000           81497 ns        81488 ns         8415 bytes_read/iteration=1000k bytes_read/sec=11.7032G/s eltcnt=1077.12M eltcnt/sec=1.57078G/s
BM_bcmp<uint32_t, Identical>_BigO              0.71 N          0.71 N
BM_bcmp<uint32_t, Identical>_RMS                 42 %            42 %
<...>
BM_bcmp<uint64_t, Identical>/64000            50138 ns        50138 ns        10909 bytes_read/iteration=1000k bytes_read/sec=19.0209G/s eltcnt=698.176M eltcnt/sec=1.27647G/s
BM_bcmp<uint64_t, Identical>_BigO              0.84 N          0.84 N
BM_bcmp<uint64_t, Identical>_RMS                 27 %            27 %
<...>
BM_bcmp<uint8_t, InequalHalfway>/512000      192405 ns       192392 ns         3638 bytes_read/iteration=1000k bytes_read/sec=4.95694G/s eltcnt=1.86266G eltcnt/sec=2.66124G/s
BM_bcmp<uint8_t, InequalHalfway>_BigO          0.38 N          0.38 N
BM_bcmp<uint8_t, InequalHalfway>_RMS              3 %             3 %
<...>
BM_bcmp<uint16_t, InequalHalfway>/256000     127858 ns       127860 ns         5477 bytes_read/iteration=1000k bytes_read/sec=7.45873G/s eltcnt=1.40211G eltcnt/sec=2.00219G/s
BM_bcmp<uint16_t, InequalHalfway>_BigO         0.50 N          0.50 N
BM_bcmp<uint16_t, InequalHalfway>_RMS             0 %             0 %
<...>
BM_bcmp<uint32_t, InequalHalfway>/128000      49140 ns        49140 ns        14281 bytes_read/iteration=1000k bytes_read/sec=19.4072G/s eltcnt=1.82797G eltcnt/sec=2.60478G/s
BM_bcmp<uint32_t, InequalHalfway>_BigO         0.40 N          0.40 N
BM_bcmp<uint32_t, InequalHalfway>_RMS            18 %            18 %
<...>
BM_bcmp<uint64_t, InequalHalfway>/64000       32101 ns        32099 ns        21786 bytes_read/iteration=1000k bytes_read/sec=29.7101G/s eltcnt=1.3943G eltcnt/sec=1.99381G/s
BM_bcmp<uint64_t, InequalHalfway>_BigO         0.50 N          0.50 N
BM_bcmp<uint64_t, InequalHalfway>_RMS             1 %             1 %
RUNNING: build-new/test/llvm-bcmp-bench --benchmark_out=/tmp/tmpQ46PP0
2019-04-25 21:19:29
Running build-new/test/llvm-bcmp-bench
Run on (8 X 4000 MHz CPU s)
CPU Caches:
  L1 Data 16K (x8)
  L1 Instruction 64K (x4)
  L2 Unified 2048K (x4)
  L3 Unified 8192K (x1)
Load Average: 1.01, 2.85, 3.71
---------------------------------------------------------------------------------------------------
Benchmark                                         Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------------------------------------
<...>
BM_bcmp<uint8_t, Identical>/512000            18593 ns        18590 ns        37565 bytes_read/iteration=1000k bytes_read/sec=51.2991G/s eltcnt=19.2333G eltcnt/sec=27.541G/s
BM_bcmp<uint8_t, Identical>_BigO               0.04 N          0.04 N
BM_bcmp<uint8_t, Identical>_RMS                  37 %            37 %
<...>
BM_bcmp<uint16_t, Identical>/256000           18950 ns        18948 ns        37223 bytes_read/iteration=1000k bytes_read/sec=50.3324G/s eltcnt=9.52909G eltcnt/sec=13.511G/s
BM_bcmp<uint16_t, Identical>_BigO              0.08 N          0.08 N
BM_bcmp<uint16_t, Identical>_RMS                 34 %            34 %
<...>
BM_bcmp<uint32_t, Identical>/128000           18627 ns        18627 ns        37895 bytes_read/iteration=1000k bytes_read/sec=51.198G/s eltcnt=4.85056G eltcnt/sec=6.87168G/s
BM_bcmp<uint32_t, Identical>_BigO              0.16 N          0.16 N
BM_bcmp<uint32_t, Identical>_RMS                 35 %            35 %
<...>
BM_bcmp<uint64_t, Identical>/64000            18855 ns        18855 ns        37458 bytes_read/iteration=1000k bytes_read/sec=50.5791G/s eltcnt=2.39731G eltcnt/sec=3.3943G/s
BM_bcmp<uint64_t, Identical>_BigO              0.32 N          0.32 N
BM_bcmp<uint64_t, Identical>_RMS                 33 %            33 %
<...>
BM_bcmp<uint8_t, InequalHalfway>/512000        9570 ns         9569 ns        73500 bytes_read/iteration=1000k bytes_read/sec=99.6601G/s eltcnt=37.632G eltcnt/sec=53.5046G/s
BM_bcmp<uint8_t, InequalHalfway>_BigO          0.02 N          0.02 N
BM_bcmp<uint8_t, InequalHalfway>_RMS             29 %            29 %
<...>
BM_bcmp<uint16_t, InequalHalfway>/256000       9547 ns         9547 ns        74343 bytes_read/iteration=1000k bytes_read/sec=99.8971G/s eltcnt=19.0318G eltcnt/sec=26.8159G/s
BM_bcmp<uint16_t, InequalHalfway>_BigO         0.04 N          0.04 N
BM_bcmp<uint16_t, InequalHalfway>_RMS            29 %            29 %
<...>
BM_bcmp<uint32_t, InequalHalfway>/128000       9396 ns         9394 ns        73521 bytes_read/iteration=1000k bytes_read/sec=101.518G/s eltcnt=9.41069G eltcnt/sec=13.6255G/s
BM_bcmp<uint32_t, InequalHalfway>_BigO         0.08 N          0.08 N
BM_bcmp<uint32_t, InequalHalfway>_RMS            30 %            30 %
<...>
BM_bcmp<uint64_t, InequalHalfway>/64000        9499 ns         9498 ns        73802 bytes_read/iteration=1000k bytes_read/sec=100.405G/s eltcnt=4.72333G eltcnt/sec=6.73808G/s
BM_bcmp<uint64_t, InequalHalfway>_BigO         0.16 N          0.16 N
BM_bcmp<uint64_t, InequalHalfway>_RMS            28 %            28 %
Comparing build-old/test/llvm-bcmp-bench to build-new/test/llvm-bcmp-bench
Benchmark                                                  Time             CPU      Time Old      Time New       CPU Old       CPU New
---------------------------------------------------------------------------------------------------------------------------------------
<...>
BM_bcmp<uint8_t, Identical>/512000                      -0.9570         -0.9570        432131         18593        432101         18590
<...>
BM_bcmp<uint16_t, Identical>/256000                     -0.8826         -0.8826        161408         18950        161409         18948
<...>
BM_bcmp<uint32_t, Identical>/128000                     -0.7714         -0.7714         81497         18627         81488         18627
<...>
BM_bcmp<uint64_t, Identical>/64000                      -0.6239         -0.6239         50138         18855         50138         18855
<...>
BM_bcmp<uint8_t, InequalHalfway>/512000                 -0.9503         -0.9503        192405          9570        192392          9569
<...>
BM_bcmp<uint16_t, InequalHalfway>/256000                -0.9253         -0.9253        127858          9547        127860          9547
<...>
BM_bcmp<uint32_t, InequalHalfway>/128000                -0.8088         -0.8088         49140          9396         49140          9394
<...>
BM_bcmp<uint64_t, InequalHalfway>/64000                 -0.7041         -0.7041         32101          9499         32099          9498
```

What can we tell from the benchmark?
* Performance of naive equality check somewhat improves with element size,
  maxing out at eltcnt/sec=1.58603G/s for uint16_t, or bytes_read/sec=19.0209G/s
  for uint64_t. I think, that instability implies performance problems.
* Performance of `memcmp()`-aware benchmark always maxes out at around
  bytes_read/sec=51.2991G/s for every type. That is 2.6x the throughput of the
  naive variant!
* eltcnt/sec metric for the `memcmp()`-aware benchmark maxes out at
  eltcnt/sec=27.541G/s for uint8_t (was: eltcnt/sec=1.18491G/s, so 24x) and
  linearly decreases with element size.
  For uint64_t, it's ~4x+ the elements/second.
* The call is obviously more pricey than the loop for small element counts.
  As it can be seen from the full output {F8768210}, the `memcmp()` is almost
  universally worse, independent of the element size (and thus buffer size) when
  element count is less than 8.

So all in all, bcmp idiom does indeed pose untapped performance headroom.
This diff does implement said idiom recognition. I think a reasonable test
coverage is present, but do tell if there is anything obvious missing.

Now, quality. This does succeed to build and pass the test-suite, at least
without any non-bundled elements. {F8768216} {F8768217}
This transform fires 91 times:
```
$ /build/test-suite/utils/compare.py -m loop-idiom.NumBCmp result-new.json
Tests: 1149
Metric: loop-idiom.NumBCmp

Program                                         result-new

MultiSourc...Benchmarks/7zip/7zip-benchmark    79.00
MultiSource/Applications/d/make_dparser         3.00
SingleSource/UnitTests/vla                      2.00
MultiSource/Applications/Burg/burg              1.00
MultiSourc.../Applications/JM/lencod/lencod     1.00
MultiSource/Applications/lemon/lemon            1.00
MultiSource/Benchmarks/Bullet/bullet            1.00
MultiSourc...e/Benchmarks/MallocBench/gs/gs     1.00
MultiSourc...gs-C/TimberWolfMC/timberwolfmc     1.00
MultiSourc...Prolangs-C/simulator/simulator     1.00
```
The size changes are:
I'm not sure what's going on with SingleSource/UnitTests/vla.test yet, did not look.
```
$ /build/test-suite/utils/compare.py -m size..text result-{old,new}.json --filter-hash
Tests: 1149
Same hash: 907 (filtered out)
Remaining: 242
Metric: size..text

Program                                        result-old result-new diff
test-suite...ingleSource/UnitTests/vla.test   753.00     833.00     10.6%
test-suite...marks/7zip/7zip-benchmark.test   1001697.00 966657.00  -3.5%
test-suite...ngs-C/simulator/simulator.test   32369.00   32321.00   -0.1%
test-suite...plications/d/make_dparser.test   89585.00   89505.00   -0.1%
test-suite...ce/Applications/Burg/burg.test   40817.00   40785.00   -0.1%
test-suite.../Applications/lemon/lemon.test   47281.00   47249.00   -0.1%
test-suite...TimberWolfMC/timberwolfmc.test   250065.00  250113.00   0.0%
test-suite...chmarks/MallocBench/gs/gs.test   149889.00  149873.00  -0.0%
test-suite...ications/JM/lencod/lencod.test   769585.00  769569.00  -0.0%
test-suite.../Benchmarks/Bullet/bullet.test   770049.00  770049.00   0.0%
test-suite...HMARK_ANISTROPIC_DIFFUSION/128    NaN        NaN        nan%
test-suite...HMARK_ANISTROPIC_DIFFUSION/256    NaN        NaN        nan%
test-suite...CHMARK_ANISTROPIC_DIFFUSION/64    NaN        NaN        nan%
test-suite...CHMARK_ANISTROPIC_DIFFUSION/32    NaN        NaN        nan%
test-suite...ENCHMARK_BILATERAL_FILTER/64/4    NaN        NaN        nan%
Geomean difference                                                   nan%
         result-old    result-new       diff
count  1.000000e+01  10.00000      10.000000
mean   3.152090e+05  311695.40000  0.006749
std    3.790398e+05  372091.42232  0.036605
min    7.530000e+02  833.00000    -0.034981
25%    4.243300e+04  42401.00000  -0.000866
50%    1.197370e+05  119689.00000 -0.000392
75%    6.397050e+05  639705.00000 -0.000005
max    1.001697e+06  966657.00000  0.106242
```

I don't have timings though.

And now to the code. The basic idea is to completely replace the whole loop.
If we can't fully kill it, don't transform.
I have left one or two comments in the code, so hopefully it can be understood.

Also, there are a few TODOs that I have left for follow-ups:
* widening of `memcmp()`/`bcmp()`
* step smaller than the comparison size
* Metadata propagation
* more than two blocks as long as there is still a single backedge?
* ???

Reviewers: reames, fhahn, mkazantsev, chandlerc, craig.topper, courbet

Reviewed By: courbet

Subscribers: miyuki, hiraditya, xbolva00, nikic, jfb, gchatelet, courbet, llvm-commits, mclow.lists

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D61144

llvm-svn: 374662
This commit is contained in:
Roman Lebedev 2019-10-12 15:35:32 +00:00
parent 7d56790f6f
commit 0dff68630e
5 changed files with 1280 additions and 603 deletions

View File

@ -66,6 +66,9 @@ Non-comprehensive list of changes in this release
Undefined Behaviour Sanitizer ``-fsanitize=pointer-overflow`` check
will now catch such cases.
* The Loop Idiom Recognition (``-loop-idiom``) pass has learned to recognize
the ``bcmp`` pattern and convert it into a call to the ``bcmp`` (or
``memcmp``) function.
Changes to the LLVM IR
----------------------

View File

@ -41,6 +41,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@ -77,16 +78,20 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@ -102,6 +107,7 @@ using namespace llvm;
STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare");
static cl::opt<bool> UseLIRCodeSizeHeurs(
"use-lir-code-size-heurs",
@ -111,6 +117,26 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(
namespace {
// FIXME: reinventing the wheel much? Is there a cleaner solution?
/// Pass-manager-agnostic interface through which the idiom recognizer reports
/// that a loop has been deleted.
struct PMAbstraction {
  virtual ~PMAbstraction() = default;

  /// Notify the owning pass manager that loop \p L no longer exists.
  virtual void markLoopAsDeleted(Loop *L) = 0;
};
struct LegacyPMAbstraction : PMAbstraction {
LPPassManager &LPM;
LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {}
virtual ~LegacyPMAbstraction() = default;
void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); }
};
struct NewPMAbstraction : PMAbstraction {
LPMUpdater &Updater;
NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {}
virtual ~NewPMAbstraction() = default;
void markLoopAsDeleted(Loop *L) override {
Updater.markLoopAsDeleted(*L, L->getName());
}
};
class LoopIdiomRecognize {
Loop *CurLoop = nullptr;
AliasAnalysis *AA;
@ -120,6 +146,7 @@ class LoopIdiomRecognize {
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
const DataLayout *DL;
PMAbstraction &LoopDeleter;
OptimizationRemarkEmitter &ORE;
bool ApplyCodeSizeHeuristics;
@ -128,9 +155,10 @@ public:
LoopInfo *LI, ScalarEvolution *SE,
TargetLibraryInfo *TLI,
const TargetTransformInfo *TTI,
const DataLayout *DL,
const DataLayout *DL, PMAbstraction &LoopDeleter,
OptimizationRemarkEmitter &ORE)
: AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {}
: AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL),
LoopDeleter(LoopDeleter), ORE(ORE) {}
bool runOnLoop(Loop *L);
@ -144,6 +172,8 @@ private:
bool HasMemset;
bool HasMemsetPattern;
bool HasMemcpy;
bool HasMemCmp;
bool HasBCmp;
/// Return code for isLegalStore()
enum LegalStoreKind {
@ -186,6 +216,32 @@ private:
bool runOnNoncountableLoop();
/// Control-flow skeleton of a candidate comparison loop, as captured from the
/// terminators of the loop header and the loop latch.
struct CmpLoopStructure {
  /// Condition of the header's conditional branch (an icmp).
  Value *BCmpValue;
  /// Condition of the latch's conditional branch (a cmp).
  Value *LatchCmpValue;
  /// Successors of the header branch.
  BasicBlock *HeaderBrEqualBB;
  BasicBlock *HeaderBrUnequalBB;
  /// Successors of the latch branch.
  BasicBlock *LatchBrFinishBB;
  BasicBlock *LatchBrContinueBB;
};
bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const;
/// Pieces of a header comparison of the form `icmp <pred> (load A), (load B)`.
struct CmpOfLoads {
  /// Predicate of the icmp.
  ICmpInst::Predicate BCmpPred;
  /// Pointer operands of the two loads.
  Value *LoadSrcA;
  Value *LoadSrcB;
  /// The two loads themselves (the icmp's operands).
  Value *LoadA;
  Value *LoadB;
};
bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const;
bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads,
CmpLoopStructure &CmpLoop) const;
bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads,
const SCEV *&SrcA, const SCEV *&SrcB,
const SCEV *&Iterations) const;
bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst,
LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA,
const SCEV *&SrcB, const SCEV *&NBytes) const;
BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual);
void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst,
LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA,
const SCEV *SrcB, const SCEV *NBytes);
bool recognizeBCmp();
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
@ -223,13 +279,14 @@ public:
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent());
const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
LegacyPMAbstraction LoopDeleter(LPM);
// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE);
LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE);
return LIR.runOnLoop(L);
}
@ -248,7 +305,7 @@ char LoopIdiomRecognizeLegacyPass::ID = 0;
PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
LPMUpdater &Updater) {
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
const auto &FAM =
@ -262,8 +319,9 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
"LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
"at a higher level");
NewPMAbstraction LoopDeleter(Updater);
LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
*ORE);
LoopDeleter, *ORE);
if (!LIR.runOnLoop(&L))
return PreservedAnalyses::all();
@ -300,7 +358,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
// Disable loop idiom recognition if the function's name is a common idiom.
StringRef Name = L->getHeader()->getParent()->getName();
if (Name == "memset" || Name == "memcpy")
if (Name == "memset" || Name == "memcpy" || Name == "memcmp" ||
Name == "bcmp")
return false;
// Determine if code size heuristics need to be applied.
@ -310,8 +369,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
HasMemset = TLI->has(LibFunc_memset);
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
HasMemcpy = TLI->has(LibFunc_memcpy);
HasMemCmp = TLI->has(LibFunc_memcmp);
HasBCmp = TLI->has(LibFunc_bcmp);
if (HasMemset || HasMemsetPattern || HasMemcpy)
if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp)
if (SE->hasLoopInvariantBackedgeTakenCount(L))
return runOnCountableLoop();
@ -1150,7 +1211,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
<< "] Noncountable Loop %"
<< CurLoop->getHeader()->getName() << "\n");
return recognizePopcount() || recognizeAndInsertFFS();
return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS();
}
/// Check if the given conditional branch is based on the comparison between
@ -1824,3 +1885,804 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// loop. The loop would otherwise not be deleted even if it becomes empty.
SE->forgetLoop(CurLoop);
}
/// Match the coarse branch structure of a comparison loop: the header must end
/// in a conditional branch on an icmp, and the latch in a conditional branch
/// on a cmp. On success the branch conditions and successors are stored in
/// \p CmpLoop and true is returned.
bool LoopIdiomRecognize::matchBCmpLoopStructure(
    CmpLoopStructure &CmpLoop) const {
  // The layout being searched for:
  //   PreheaderBB: <preheader>        ; preds = ???
  //     <...>
  //     br label %LoopHeaderBB
  //   LoopHeaderBB: <header,exiting>  ; preds = %PreheaderBB,%LoopLatchBB
  //     <...>
  //     %BCmpValue = icmp <...>
  //     br i1 %BCmpValue, label %LoopLatchBB, label %Successor0
  //   LoopLatchBB: <latch,exiting>    ; preds = %LoopHeaderBB
  //     <...>
  //     %LatchCmpValue = <are we done, or do next iteration?>
  //     br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB
  //   Successor0: <exit>              ; preds = %LoopHeaderBB
  //     <...>
  //   Successor1: <exit>              ; preds = %LoopLatchBB
  //     <...>
  //
  // Successor0 and Successor1 may or may not be the same basic block.
  using namespace PatternMatch;

  // The header predicate is captured only because m_ICmp needs somewhere to
  // store it; it is not reported to the caller.
  ICmpInst::Predicate HeaderPred;
  auto HeaderCond = m_CombineAnd(m_ICmp(HeaderPred, m_Value(), m_Value()),
                                 m_Value(CmpLoop.BCmpValue));
  auto LatchCond = m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue));

  // The latch terminator is only inspected once the header has matched.
  const bool HeaderMatches =
      match(CurLoop->getHeader()->getTerminator(),
            m_Br(HeaderCond, CmpLoop.HeaderBrEqualBB,
                 CmpLoop.HeaderBrUnequalBB));
  const bool LayoutMatches =
      HeaderMatches &&
      match(CurLoop->getLoopLatch()->getTerminator(),
            m_Br(LatchCond, CmpLoop.LatchBrFinishBB,
                 CmpLoop.LatchBrContinueBB));

  if (!LayoutMatches) {
    LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n");
  return true;
}
/// Check that \p BCmpValue is an equality icmp (eq or ne) whose two operands
/// are loads. On success the predicate, both loads and both load addresses are
/// stored in \p CmpOfLoads and true is returned.
bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue,
                                          CmpOfLoads &CmpOfLoads) const {
  using namespace PatternMatch;
  LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue
                    << " as bcmp pattern.\n");

  // A bcmp-style header comparison is an equality icmp of two loads, e.g.:
  //   %v0 = load <...>, <...>* %LoadSrcA
  //   %v1 = load <...>, <...>* %LoadSrcB
  //   %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1
  // There won't be any no-op bitcasts between load and icmp,
  // they would have been transformed into a load of bitcast.
  // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too.
  auto LhsLoad = m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)),
                              m_Value(CmpOfLoads.LoadA));
  auto RhsLoad = m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)),
                              m_Value(CmpOfLoads.LoadB));

  // The predicate is only examined once the structural match has bound it.
  const bool IsEqualityOfLoads =
      match(BCmpValue, m_ICmp(CmpOfLoads.BCmpPred, LhsLoad, RhsLoad)) &&
      ICmpInst::isEquality(CmpOfLoads.BCmpPred);
  if (!IsEqualityOfLoads) {
    LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t"
                    << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB
                    << "\n");
  // FIXME: handle memcmp pattern?
  return true;
}
/// Validate the control flow and side effects of a candidate comparison loop.
///
/// First canonicalizes the successor fields of \p CmpLoop in place, so that
/// HeaderBrEqualBB is the header successor taken when the compared elements
/// are equal and LatchBrContinueBB is the latch successor that is the loop
/// header. Then returns true only if:
///  * the header's "equal" successor is the latch and the latch's "continue"
///    successor is the header;
///  * no PHI node in an exit block receives, from a block inside the loop, an
///    instruction defined inside the loop;
///  * no instruction in the loop other than the two loads in \p CmpOfLoads
///    (debug-info intrinsics are ignored) may have side effects, is atomic or
///    is fence-like.
bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow(
const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const {
BasicBlock *LoopHeaderBB = CurLoop->getHeader();
BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
// Be wary, comparisons can be inverted, canonicalize order.
// If this 'element' comparison passed, we expect to proceed to the next elt.
if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ)
std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB);
// The predicate on loop latch does not matter, just canonicalize some order.
if (CmpLoop.LatchBrContinueBB != LoopHeaderBB)
std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB);
// Check that control-flow between blocks is as expected.
if (CmpLoop.HeaderBrEqualBB != LoopLatchBB ||
CmpLoop.LatchBrContinueBB != LoopHeaderBB) {
LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n");
return false;
}
// With the back-edge and the header->latch edge established above, the
// remaining two successors are expected to be the loop's exits.
SmallVector<BasicBlock *, 2> ExitBlocks;
CurLoop->getUniqueExitBlocks(ExitBlocks);
assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks.");
assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) &&
is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) &&
!is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) &&
is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB) &&
"Unexpected exit edges.");
LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n");
LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n");
assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here.");
// No loop instructions must be used outside of the loop. Since we are in
// LCSSA form, we only need to check successor block's PHI nodes's incoming
// values for incoming blocks that are the loop basic blocks.
for (const BasicBlock *ExitBB : ExitBlocks) {
for (const PHINode &PHI : ExitBB->phis()) {
for (const BasicBlock *LoopBB :
make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) {
return CurLoop->contains(PredecessorBB);
})) {
// Non-instruction incoming values (the dyn_cast yields null) and
// instructions defined outside the loop are acceptable.
const auto *I =
dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB));
if (I && CurLoop->contains(I)) {
LLVM_DEBUG(dbgs()
<< "Loop contains instruction " << *I
<< " which is used outside of the loop in basic block "
<< ExitBB->getName() << " in phi node " << PHI << "\n");
return false;
}
}
}
}
// Similarly, the loop should not have any other observable side-effects
// other than the final comparison result.
// NOTE(review): LoadA and LoadB are exempted from this check, so nothing
// here rejects them if they are themselves volatile or atomic; presumably
// that is established elsewhere -- TODO confirm.
for (BasicBlock *LoopBB : CurLoop->blocks()) {
for (Instruction &I : *LoopBB) {
if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo.
continue; // FIXME: anything else? lifetime info?
if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) &&
&I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) {
LLVM_DEBUG(
dbgs() << "Loop contains instruction with potential side-effects: "
<< I << "\n");
return false;
}
}
}
LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n");
return true;
}
/// Analyze the load addresses and the latch exit count of a candidate
/// comparison loop with ScalarEvolution.
///
/// \param BCmpTyBytes  Required per-iteration step of both load addresses.
/// \param CmpOfLoads   Supplies the two load addresses (LoadSrcA, LoadSrcB).
/// \param[out] SrcA    Start value of the add-recurrence for LoadSrcA.
/// \param[out] SrcB    Start value of the add-recurrence for LoadSrcB.
/// \param[out] Iterations  The latch exit count plus one.
/// \returns true if both addresses are affine add-recurrences of the current
/// loop with a constant step equal to \p BCmpTyBytes, and the start values and
/// the latch exit count are computable and available at loop entry. On a
/// false return some of the output parameters may already have been written.
bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes,
CmpOfLoads &CmpOfLoads,
const SCEV *&SrcA,
const SCEV *&SrcB,
const SCEV *&Iterations) const {
// Try to compute SCEV of the loads, for this loop's scope.
const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>(
SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop));
const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>(
SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop));
// Anything that is not an add-recurrence makes the dyn_cast yield null.
if (!ScevForSrcA || !ScevForSrcB) {
LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n");
return false;
}
LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t"
<< *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n");
// Loads must have following SCEV exprs: {%ptr,+,BCmpTyBytes}<%LoopHeaderBB>
const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE);
const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE);
// Both recurrences must belong to the loop being analysed (not to some other
// loop) and must advance by exactly the number of bytes compared.
if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() ||
ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop ||
RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) ||
cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) {
LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support "
"affine SCEV expressions originating in the loop we "
"are analysing with identical constant positive step, "
"equal to the count of bytes compared. Got:\n\t"
<< *RecStepForA << "\n\t" << *RecStepForB << "\n");
return false;
// FIXME: can support BCmpTyBytes > Step.
// But will need to account for the extra bytes compared at the end.
}
SrcA = ScevForSrcA->getStart();
SrcB = ScevForSrcB->getStart();
LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA
<< "\n\t" << *SrcB << "\n");
// The load sources must be loop-invariants that dominate the loop header.
if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() ||
!SE->isAvailableAtLoopEntry(SrcA, CurLoop) ||
!SE->isAvailableAtLoopEntry(SrcB, CurLoop)) {
LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavaliable "
"prior to loop header.\n");
return false;
}
LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n");
// For how many iterations is loop guaranteed not to exit via LoopLatch?
// This is one less than the maximal number of comparisons, and is: n + -1
const SCEV *LoopExitCount =
SE->getExitCount(CurLoop, CurLoop->getLoopLatch());
LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: "
<< *LoopExitCount << "\n");
// Exit count, similarly, must be loop-invariant that dominates the loop
// header.
if (LoopExitCount == SE->getCouldNotCompute() ||
!LoopExitCount->getType()->isIntOrPtrTy() ||
!SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) {
LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n");
return false;
}
// LoopExitCount is always one less than the actual count of iterations.
// Do this before cast, else we will be stuck with 1 + zext(-1 + n)
// The addition is built with the no-unsigned-wrap flag.
Iterations = SE->getAddExpr(
LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW);
assert(Iterations != SE->getCouldNotCompute() &&
"Shouldn't fail to increment by one.");
LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n");
return true;
}
/// Return true iff the bcmp idiom is detected in the loop.
///
/// Detection proceeds in stages: basic loop shape, loop structure matching,
/// matching of the comparison of two loads, control-flow validation, element
/// size validation, SCEV analysis of the load addresses and trip count, and
/// finally a check that the loads themselves are supported.
///
/// Additionally:
/// 1) \p BCmpInst is set to the root byte-comparison instruction.
/// 2) \p LatchCmpInst is set to the comparison that controls the latch.
/// 3) \p LoadA is set to the first LoadInst.
/// 4) \p LoadB is set to the second LoadInst.
/// 5) \p SrcA is set to the first source location that is being compared.
/// 6) \p SrcB is set to the second source location that is being compared.
/// 7) \p NBytes is set to the number of bytes to compare.
///
/// The out-parameters are only meaningful when true is returned; some of them
/// are written before the later checks run, so they may hold partial results
/// when false is returned.
bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst,
                                         CmpInst *&LatchCmpInst,
                                         LoadInst *&LoadA, LoadInst *&LoadB,
                                         const SCEV *&SrcA, const SCEV *&SrcB,
                                         const SCEV *&NBytes) const {
  LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n");

  // Give up if the loop is not in normal form, or has more than 2 blocks.
  if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) {
    LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n");
    return false;
  }
  LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n");

  CmpLoopStructure CmpLoop;
  if (!matchBCmpLoopStructure(CmpLoop))
    return false;

  CmpOfLoads CmpOfLoads;
  if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads))
    return false;

  if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop))
    return false;

  // FIXME: is there no way to combine these casts with m_Value() matcher?
  BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue);
  LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue);
  LoadA = cast<LoadInst>(CmpOfLoads.LoadA);
  LoadB = cast<LoadInst>(CmpOfLoads.LoadB);

  Type *BCmpValTy = BCmpInst->getOperand(0)->getType();
  LLVMContext &Context = BCmpValTy->getContext();
  uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy);
  static constexpr uint64_t ByteTyBits = 8;

  LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy
                    << " of size " << BCmpTyBits
                    << " bits (while byte = " << ByteTyBits << " bits).\n");

  // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check
  // that we are dealing with a multiple of a byte here.
  if (BCmpTyBits % ByteTyBits != 0) {
    LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n");
    // FIXME: could still be done under a run-time check that the total bit
    // count is a multiple of a byte i guess? Or handle remainder separately?
    return false;
  }

  // Each comparison is done on this many bytes.
  uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits;
  LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes
                    << " bytes, eligible for bcmp conversion.\n");

  const SCEV *Iterations;
  if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations))
    return false;

  // bcmp / memcmp take length argument as size_t, do promotion now.
  Type *CmpFuncSizeTy = DL->getIntPtrType(Context);
  Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy);
  assert(Iterations != SE->getCouldNotCompute() && "Promotion failed.");
  // Note that it didn't do ptrtoint cast, we will need to do it manually.

  // We will be comparing *bytes*, not BCmpTy, we need to recalculate size.
  // It's a multiplication, and it *could* overflow. But for it to overflow
  // we'd want to compare more bytes than could be represented by size_t, But
  // allocation functions also take size_t. So how'd you produce such buffer?
  // FIXME: we likely need to actually check that we know this won't overflow,
  //        via llvm::computeOverflowForUnsignedMul().
  NBytes = SE->getMulExpr(
      Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW);
  // The message used to be a stale copy of the "increment by one" assertion;
  // what is being validated here is the multiplication by the element size.
  assert(NBytes != SE->getCouldNotCompute() &&
         "Shouldn't fail to multiply by the element size.");

  LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n");

  // Only simple loads that both read from address space 0 are handled.
  if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() ||
      LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() ||
      !LoadB->isSimple()) {
    StringLiteral L("Unsupported loads in idiom - only support identical, "
                    "simple loads from address space 0.\n");
    LLVM_DEBUG(dbgs() << L);
    ORE.emit([&]() {
      return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads",
                                      BCmpInst->getDebugLoc(),
                                      CurLoop->getHeader())
             << L;
    });
    return false; // FIXME
  }

  LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n");
  ORE.emit([&]() {
    return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom",
                                      CurLoop->getStartLoc(),
                                      CurLoop->getHeader())
           << "Loop recognized as a bcmp idiom";
  });

  return true;
}
/// Replace the control flow of the recognized comparison loop with a single
/// conditional branch on \p ComparedEqual, and delete the loop.
///
/// The loop is first disconnected from its preheader and its exits are
/// redirected to a temporary block, so that it can be removed as a dead loop.
/// Afterwards the CFG is re-linked so that the former preheader reaches a
/// "dispatch" block that branches, on \p ComparedEqual, to one of two new
/// blocks which forward to the original exit blocks of the loop. The PHI nodes
/// in those exit blocks are rewired to the new blocks. The dominator tree is
/// kept up to date throughout. If the loop was nested, a placeholder loop
/// made of the three new blocks is registered in the parent loop.
///
/// As a side effect \c CurLoop is reset to null.
///
/// \param ComparedEqual  Boolean that is true iff all elements compared equal.
///                       Per the diagrams below it is expected to live in the
///                       loop preheader.
/// \returns the dispatch basic block that ends with the conditional branch.
BasicBlock *
LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) {
  LLVM_DEBUG(dbgs() << "Transforming control-flow.\n");
  SmallVector<DominatorTree::UpdateType, 8> DTUpdates;

  BasicBlock *PreheaderBB = CurLoop->getLoopPreheader();
  BasicBlock *HeaderBB = CurLoop->getHeader();
  BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
  SmallString<32> LoopName = CurLoop->getName();
  Function *Func = PreheaderBB->getParent();
  LLVMContext &Context = Func->getContext();

  // Before doing anything, drop SCEV info.
  SE->forgetLoop(CurLoop);

  // Here we start with: (0/6)
  //  PreheaderBB: <preheader>        ; preds = ???
  //    <...>
  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
  //    %ComparedEqual = icmp eq <...> %memcmp, 0
  //    br label %LoopHeaderBB
  //  LoopHeaderBB: <header,exiting>  ; preds = %PreheaderBB,%LoopLatchBB
  //    <...>
  //    br i1 %<...>, label %LoopLatchBB, label %Successor0BB
  //  LoopLatchBB: <latch,exiting>    ; preds = %LoopHeaderBB
  //    <...>
  //    br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
  //  Successor0BB: <exit>            ; preds = %LoopHeaderBB
  //    %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
  //    <...>
  //  Successor1BB: <exit>            ; preds = %LoopLatchBB
  //    %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
  //    <...>
  //
  // Successor0 and Successor1 may or may not be the same basic block.

  // Decouple the edge between loop preheader basic block and loop header basic
  // block. Thus the loop has become unreachable.
  assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() &&
         PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB &&
         "Preheader bb must end with an unconditional branch to header bb.");
  PreheaderBB->getTerminator()->eraseFromParent();
  DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB});

  // Create a new preheader basic block before loop header basic block.
  auto *PhonyPreheaderBB = BasicBlock::Create(
      Context, LoopName + ".phonypreheaderbb", Func, HeaderBB);
  // And insert an unconditional branch from phony preheader basic block to
  // loop header basic block.
  IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB);
  DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});

  // Create a *single* new empty block that we will substitute as a
  // successor basic block for the loop's exits. This one is temporary.
  // Much like phony preheader basic block, it is not connected.
  auto *PhonySuccessorBB =
      BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func,
                         LoopLatchBB->getNextNode());
  // That block must have *some* non-PHI instruction, or else deleteDeadLoop()
  // will mess up cleanup of dbginfo, and verifier will complain.
  IRBuilder<>(PhonySuccessorBB).CreateUnreachable();

  // Create two new empty blocks that we will use to preserve the original
  // loop exit control-flow, and preserve the incoming values in the PHI nodes
  // in loop's successor exit blocks. These will live on.
  auto *ComparedUnequalBB =
      BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func,
                         PhonySuccessorBB->getNextNode());
  auto *ComparedEqualBB =
      BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func,
                         PhonySuccessorBB->getNextNode());

  // By now we have: (1/6)
  //  PreheaderBB:                    ; preds = ???
  //    <...>
  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
  //    %ComparedEqual = icmp eq <...> %memcmp, 0
  //    [no terminator instruction!]
  //  PhonyPreheaderBB: <preheader>   ; No preds, UNREACHABLE!
  //    br label %LoopHeaderBB
  //  LoopHeaderBB: <header,exiting>  ; preds = %PhonyPreheaderBB, %LoopLatchBB
  //    <...>
  //    br i1 %<...>, label %LoopLatchBB, label %Successor0BB
  //  LoopLatchBB: <latch,exiting>    ; preds = %LoopHeaderBB
  //    <...>
  //    br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
  //  PhonySuccessorBB:               ; No preds, UNREACHABLE!
  //    unreachable
  //  EqualBB:                        ; No preds, UNREACHABLE!
  //    [no terminator instruction!]
  //  UnequalBB:                      ; No preds, UNREACHABLE!
  //    [no terminator instruction!]
  //  Successor0BB: <exit>            ; preds = %LoopHeaderBB
  //    %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
  //    <...>
  //  Successor1BB: <exit>            ; preds = %LoopLatchBB
  //    %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
  //    <...>

  // What is the mapping/replacement basic block for exiting out of the loop
  // from either of old's loop basic blocks?
  auto GetReplacementBB = [this, ComparedEqualBB,
                           ComparedUnequalBB](const BasicBlock *OldBB) {
    assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks.");
    if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal".
      return ComparedEqualBB;
    if (OldBB == CurLoop->getHeader()) // "element compared unequal".
      return ComparedUnequalBB;
    llvm_unreachable("Only had two basic blocks in loop.");
  };

  // What are the exits out of this loop?
  SmallVector<Loop::Edge, 2> LoopExitEdges;
  CurLoop->getExitEdges(LoopExitEdges);
  assert(LoopExitEdges.size() == 2 && "Should have only to two exit edges.");

  // Populate new basic blocks, update the exiting control-flow, PHI nodes.
  for (const Loop::Edge &Edge : LoopExitEdges) {
    auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first);
    auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second);
    assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) &&
           "Unexpected edge.");

    // If we would exit the loop from this loop's basic block,
    // what semantically would that mean? Did comparison succeed or fail?
    BasicBlock *NewBB = GetReplacementBB(OldLoopBB);
    assert(NewBB->empty() && "Should not get same new basic block here twice.");
    IRBuilder<> Builder(NewBB);
    Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc());
    Builder.CreateBr(SuccessorBB);
    DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB});
    // Also, be *REALLY* careful with PHI nodes in successor basic block,
    // update them to receive the same input value, but not from current loop's
    // basic block, but from new basic block instead.
    SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB);
    // Also, change loop control-flow. This loop's basic block shall no longer
    // exit from the loop to its original successor basic block, but to our new
    // phony successor basic block. Note that new successor will be unique exit.
    OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB,
                                                     PhonySuccessorBB);
    DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB});
    DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB});
  }

  // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date.
  assert(DTUpdates.size() == 8 && "Update count prediction failed.");
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
  DTU.applyUpdates(DTUpdates);
  DTUpdates.clear();

  // By now we have: (2/6)
  //  PreheaderBB:                    ; preds = ???
  //    <...>
  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
  //    %ComparedEqual = icmp eq <...> %memcmp, 0
  //    [no terminator instruction!]
  //  PhonyPreheaderBB: <preheader>   ; No preds, UNREACHABLE!
  //    br label %LoopHeaderBB
  //  LoopHeaderBB: <header,exiting>  ; preds = %PhonyPreheaderBB, %LoopLatchBB
  //    <...>
  //    br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB
  //  LoopLatchBB: <latch,exiting>    ; preds = %LoopHeaderBB
  //    <...>
  //    br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB
  //  PhonySuccessorBB: <uniq. exit>  ; preds = %LoopHeaderBB, %LoopLatchBB
  //    unreachable
  //  EqualBB:                        ; No preds, UNREACHABLE!
  //    br label %Successor1BB
  //  UnequalBB:                      ; No preds, UNREACHABLE!
  //    br label %Successor0BB
  //  Successor0BB:                   ; preds = %UnequalBB
  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
  //    <...>
  //  Successor1BB:                   ; preds = %EqualBB
  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
  //    <...>

  // *Finally*, zap the original loop. Record its parent loop though.
  Loop *ParentLoop = CurLoop->getParentLoop();
  LLVM_DEBUG(dbgs() << "Deleting old loop.\n");
  LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting!
  deleteDeadLoop(CurLoop, DT, SE, LI);    // And actually delete the loop.
  CurLoop = nullptr;

  // By now we have: (3/6)
  //  PreheaderBB:                    ; preds = ???
  //    <...>
  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
  //    %ComparedEqual = icmp eq <...> %memcmp, 0
  //    [no terminator instruction!]
  //  PhonyPreheaderBB:               ; No preds, UNREACHABLE!
  //    br label %PhonySuccessorBB
  //  PhonySuccessorBB:               ; preds = %PhonyPreheaderBB
  //    unreachable
  //  EqualBB:                        ; No preds, UNREACHABLE!
  //    br label %Successor1BB
  //  UnequalBB:                      ; No preds, UNREACHABLE!
  //    br label %Successor0BB
  //  Successor0BB:                   ; preds = %UnequalBB
  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
  //    <...>
  //  Successor1BB:                   ; preds = %EqualBB
  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
  //    <...>

  // Now, actually restore the CFG.

  // Insert an unconditional branch from an actual preheader basic block to
  // phony preheader basic block.
  IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB);
  // Record the edge that was just created. The original loop (and with it its
  // header block) no longer exists at this point, see step (3/6), so the
  // update must not name the old header; it must describe
  // PreheaderBB -> PhonyPreheaderBB.
  DTUpdates.push_back({DominatorTree::Insert, PreheaderBB, PhonyPreheaderBB});

  // Insert proper conditional branch from phony successor basic block to the
  // "dispatch" basic blocks, which were used to preserve incoming values in
  // original loop's successor basic blocks.
  assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) &&
         "Yep, that's the one we created to keep deleteDeadLoop() happy.");
  PhonySuccessorBB->getTerminator()->eraseFromParent();
  {
    IRBuilder<> Builder(PhonySuccessorBB);
    Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc());
    Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB);
  }
  DTUpdates.push_back(
      {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB});
  DTUpdates.push_back(
      {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB});

  // From here on the phony successor block serves as the dispatch block.
  BasicBlock *DispatchBB = PhonySuccessorBB;
  DispatchBB->setName(LoopName + ".bcmpdispatchbb");

  assert(DTUpdates.size() == 3 && "Update count prediction failed.");
  DTU.applyUpdates(DTUpdates);
  DTUpdates.clear();

  // By now we have: (4/6)
  //  PreheaderBB:                    ; preds = ???
  //    <...>
  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
  //    %ComparedEqual = icmp eq <...> %memcmp, 0
  //    br label %PhonyPreheaderBB
  //  PhonyPreheaderBB:               ; preds = %PreheaderBB
  //    br label %DispatchBB
  //  DispatchBB:                     ; preds = %PhonyPreheaderBB
  //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
  //  EqualBB:                        ; preds = %DispatchBB
  //    br label %Successor1BB
  //  UnequalBB:                      ; preds = %DispatchBB
  //    br label %Successor0BB
  //  Successor0BB:                   ; preds = %UnequalBB
  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
  //    <...>
  //  Successor1BB:                   ; preds = %EqualBB
  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
  //    <...>

  // The basic CFG has been restored! Now let's merge redundant basic blocks.

  // Merge phony successor basic block into its only predecessor,
  // phony preheader basic block. It is fully pointlessly redundant.
  MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);

  // By now we have: (5/6)
  //  PreheaderBB:                    ; preds = ???
  //    <...>
  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
  //    %ComparedEqual = icmp eq <...> %memcmp, 0
  //    br label %DispatchBB
  //  DispatchBB:                     ; preds = %PreheaderBB
  //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
  //  EqualBB:                        ; preds = %DispatchBB
  //    br label %Successor1BB
  //  UnequalBB:                      ; preds = %DispatchBB
  //    br label %Successor0BB
  //  Successor0BB:                   ; preds = %UnequalBB
  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
  //    <...>
  //  Successor1BB:                   ; preds = %EqualBB
  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
  //    <...>

  // Was this loop nested?
  if (!ParentLoop) {
    // If the loop was *NOT* nested, then let's also merge phony successor
    // basic block into its only predecessor, preheader basic block.
    // Also, here we need to update LoopInfo.
    LI->removeBlock(PreheaderBB);
    MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);

    // By now we have: (6/6)
    //  DispatchBB:                   ; preds = ???
    //    <...>
    //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
    //    %ComparedEqual = icmp eq <...> %memcmp, 0
    //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
    //  EqualBB:                      ; preds = %DispatchBB
    //    br label %Successor1BB
    //  UnequalBB:                    ; preds = %DispatchBB
    //    br label %Successor0BB
    //  Successor0BB:                 ; preds = %UnequalBB
    //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
    //    <...>
    //  Successor1BB:                 ; preds = %EqualBB
    //    %S1PHI = phi <...> [ <...>, %EqualBB ]
    //    <...>

    return DispatchBB;
  }

  // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop.
  // To achieve that, we shall keep the preheader basic block (mainly so that
  // the loop header block will be guaranteed to have a predecessor outside of
  // the loop), and create a phony loop with all these new three basic blocks.
  Loop *PhonyLoop = LI->AllocateLoop();
  ParentLoop->addChildLoop(PhonyLoop);
  PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI);
  PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI);
  PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI);

  // But we only have a preheader basic block, a header basic block and
  // two exiting basic blocks. For a proper loop we also need a backedge from
  // non-header basic block to header bb.
  // Let's just add a never-taken branch from both of the exiting basic blocks.
  for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) {
    BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator());
    assert(OldTerminator->isUnconditional() && "That's the one we created.");
    BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0);

    IRBuilder<> Builder(OldTerminator);
    Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc());
    Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB,
                         DispatchBB);
    OldTerminator->eraseFromParent();
    // Yes, the backedge will never be taken. The control-flow is redundant.
    // If it can be simplified further, other passes will take care.
    DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB});
    DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB});
    DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB});
  }
  assert(DTUpdates.size() == 6 && "Update count prediction failed.");
  DTU.applyUpdates(DTUpdates);
  DTUpdates.clear();

  // By now we have: (6/6)
  //  PreheaderBB: <preheader>        ; preds = ???
  //    <...>
  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
  //    %ComparedEqual = icmp eq <...> %memcmp, 0
  //    br label %BCmpDispatchBB
  //  BCmpDispatchBB: <header>        ; preds = %PreheaderBB
  //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
  //  EqualBB: <latch,exiting>        ; preds = %BCmpDispatchBB
  //    br i1 %true, label %Successor1BB, label %BCmpDispatchBB
  //  UnequalBB: <latch,exiting>      ; preds = %BCmpDispatchBB
  //    br i1 %true, label %Successor0BB, label %BCmpDispatchBB
  //  Successor0BB:                   ; preds = %UnequalBB
  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
  //    <...>
  //  Successor1BB:                   ; preds = %EqualBB
  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
  //    <...>

  // Finally fully DONE!
  return DispatchBB;
}
/// Emit the library call that replaces the recognized comparison loop, then
/// rewrite the control flow around it.
///
/// The two start addresses (\p SrcA, \p SrcB) and the byte count (\p NBytes)
/// are expanded in front of the preheader's terminator, followed by a call to
/// bcmp or memcmp and a comparison of its result against zero. The loop is
/// then replaced via transformBCmpControlFlow().
///
/// \param BCmpInst      The original value comparison; supplies the debug
///                      location for the call and the equality check.
/// \param LatchCmpInst  The latch comparison; supplies the debug location for
///                      the length computation.
/// \param LoadA, LoadB  The original loads; used only to pick debug locations
///                      for the expanded pointers.
void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst,
                                             CmpInst *LatchCmpInst,
                                             LoadInst *LoadA, LoadInst *LoadB,
                                             const SCEV *SrcA, const SCEV *SrcB,
                                             const SCEV *NBytes) {
  // Everything new goes right in front of the preheader's terminator.
  IRBuilder<> Builder(CurLoop->getLoopPreheader()->getTerminator());

  LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n");
  LLVM_DEBUG(dbgs() << "Emitting new instructions.\n");

  // Materialize the two pointers to compare and the length in bytes. Keep in
  // mind that the iteration count may be of pointer type, and that a single
  // compared element may span more than one byte.
  Value *PtrA = nullptr;
  Value *PtrB = nullptr;
  Value *Len = nullptr;
  {
    SCEVExpander Expander(*SE, *DL, "LoopToBCmp");
    Expander.setInsertPoint(&*Builder.GetInsertPoint());

    // Expand a source address. The debug location of the original load's
    // pointer operand is used when that operand is an instruction; otherwise
    // no location is attached.
    auto ExpandPointer = [&Expander](LoadInst *Load, const SCEV *Src) {
      const auto *PtrInst = dyn_cast<Instruction>(Load->getPointerOperand());
      Expander.SetCurrentDebugLocation(PtrInst ? PtrInst->getDebugLoc()
                                               : DebugLoc());
      return Expander.expandCodeFor(Src);
    };
    PtrA = ExpandPointer(LoadA, SrcA);
    PtrB = ExpandPointer(LoadB, SrcB);

    // The length computation borrows the latch condition's debug location.
    const DebugLoc &LatchLoc = LatchCmpInst->getDebugLoc();
    Builder.SetCurrentDebugLocation(LatchLoc);
    Expander.SetCurrentDebugLocation(LatchLoc);
    Len = Expander.expandCodeFor(NBytes);

    Type *CmpFuncSizeTy = DL->getIntPtrType(Builder.getContext());
    assert(SE->getTypeSizeInBits(Len->getType()) ==
               DL->getTypeSizeInBits(CmpFuncSizeTy) &&
           "Len should already have the correct size.");

    // The length has to be an integer; convert it if it is still a pointer.
    if (Len->getType()->isPointerTy())
      Len = Builder.CreatePtrToInt(Len, CmpFuncSizeTy);
    assert(Len->getType() == CmpFuncSizeTy && "Should have correct type now.");
    Len->setName(Len->getName() + ".bytecount");

    // No legality check is required here. The question being asked is whether
    // the memory regions [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are identical.
    // They are equal only if every bit matches, and they are unequal as soon
    // as a single bit differs, so the width of one comparison step (how many
    // bits were compared at once by the loop) makes no difference.
  }

  // Everything emitted from now on is attributed to the value comparison.
  Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc());

  // Emit the comparison call itself, preferring bcmp when it is available.
  Value *RawCall = nullptr;
  if (HasBCmp)
    RawCall = emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI);
  else
    RawCall = emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI);
  auto *CmpCall = cast<CallInst>(RawCall);
  // FIXME: add {B,Mem}CmpInst with MemoryCompareInst
  // (based on MemIntrinsicBase) as base?
  // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...)

  // Both routines yield 0 for equal regions and a non-zero value otherwise.
  Value *Zero = ConstantInt::get(CmpCall->getType(), 0);
  auto *ComparedEqual = cast<ICmpInst>(Builder.CreateICmpEQ(
      CmpCall, Zero, PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp"));

  BasicBlock *BB = transformBCmpControlFlow(ComparedEqual);
  Builder.ClearInsertionPoint();

  // All done; report what happened.
  LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n");
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall",
                              CmpCall->getDebugLoc(), BB)
           << "Transformed bcmp idiom into a call to "
           << ore::NV("NewFunction", CmpCall->getCalledFunction())
           << "() function";
  });
  ++NumBCmp;
}
/// Try to recognize the bcmp idiom in the current loop and, when found,
/// replace the loop with a call to bcmp (or memcmp).
///
/// Nothing is attempted unless at least one of \c HasMemCmp / \c HasBCmp is
/// set (presumably reflecting availability of the library functions).
///
/// \returns true if the loop was transformed, false otherwise.
bool LoopIdiomRecognize::recognizeBCmp() {
  if (!(HasMemCmp || HasBCmp))
    return false;

  // Results of the detection step, consumed by the transformation step.
  ICmpInst *BCmpInst = nullptr;
  CmpInst *LatchCmpInst = nullptr;
  LoadInst *LoadA = nullptr;
  LoadInst *LoadB = nullptr;
  const SCEV *SrcA = nullptr;
  const SCEV *SrcB = nullptr;
  const SCEV *NBytes = nullptr;

  const bool Detected = detectBCmpIdiom(BCmpInst, LatchCmpInst, LoadA, LoadB,
                                        SrcA, SrcB, NBytes);
  if (Detected) {
    transformLoopToBCmp(BCmpInst, LatchCmpInst, LoadA, LoadB, SrcA, SrcB,
                        NBytes);
    return true;
  }

  LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n");
  return false;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -debugify -loop-idiom < %s -S 2>&1 | FileCheck %s
; RUN: opt -debugify -loop-idiom -pass-remarks=loop-idiom -pass-remarks-analysis=loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S 2>&1 | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
@ -23,38 +23,37 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; sink(std::equal(ptr0[i], ptr0[i] + count[i], ptr1[i]));
; }
; CHECK: remark: <stdin>:13:1: Loop recognized as a bcmp idiom
; CHECK: remark: <stdin>:11:1: Transformed bcmp idiom into a call to memcmp() function
; CHECK: remark: <stdin>:29:1: Loop recognized as a bcmp idiom
; CHECK: remark: <stdin>:34:1: Transformed bcmp idiom into a call to memcmp() function
define i1 @_Z43index_iteration_eq_variable_size_no_overlapPKcm(i8* nocapture %ptr, i64 %count) {
; CHECK-LABEL: @_Z43index_iteration_eq_variable_size_no_overlapPKcm(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[COUNT:%.*]], !dbg !22
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[COUNT_BYTECOUNT:%.*]], !dbg !22
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[ADD_PTR]], metadata !9, metadata !DIExpression()), !dbg !22
; CHECK-NEXT: [[CMP14:%.*]] = icmp eq i64 [[COUNT]], 0, !dbg !23
; CHECK-NEXT: [[CMP14:%.*]] = icmp eq i64 [[COUNT_BYTECOUNT]], 0, !dbg !23
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP14]], metadata !11, metadata !DIExpression()), !dbg !23
; CHECK-NEXT: br i1 [[CMP14]], label [[CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]], !dbg !24
; CHECK: for.body.preheader:
; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg !25
; CHECK: for.cond:
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INC:%.*]], [[COUNT]], !dbg !26
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP]], metadata !13, metadata !DIExpression()), !dbg !26
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[CLEANUP_LOOPEXIT:%.*]], !dbg !27
; CHECK: for.body:
; CHECK-NEXT: [[I_015:%.*]] = phi i64 [ [[INC]], [[FOR_COND:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ], !dbg !28
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[I_015]], metadata !14, metadata !DIExpression()), !dbg !28
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[I_015]], !dbg !29
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[ARRAYIDX]], metadata !15, metadata !DIExpression()), !dbg !29
; CHECK-NEXT: [[V0:%.*]] = load i8, i8* [[ARRAYIDX]], !dbg !30
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8 [[V0]], metadata !16, metadata !DIExpression()), !dbg !30
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[I_015]], !dbg !31
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[ARRAYIDX1]], metadata !17, metadata !DIExpression()), !dbg !31
; CHECK-NEXT: [[V1:%.*]] = load i8, i8* [[ARRAYIDX1]], !dbg !32
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8 [[V1]], metadata !18, metadata !DIExpression()), !dbg !32
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[V0]], [[V1]], !dbg !33
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP3]], metadata !19, metadata !DIExpression()), !dbg !33
; CHECK-NEXT: [[INC]] = add nuw i64 [[I_015]], 1, !dbg !34
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[INC]], metadata !20, metadata !DIExpression()), !dbg !34
; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_COND]], label [[CLEANUP_LOOPEXIT]], !dbg !25
; CHECK-NEXT: br i1 [[CMP14]], label [[CLEANUP:%.*]], label [[FOR_BODY_BCMPDISPATCHBB:%.*]], !dbg !24
; CHECK: for.body.bcmpdispatchbb:
; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[PTR]], i8* [[ADD_PTR]], i64 [[COUNT_BYTECOUNT]]), !dbg !25
; CHECK-NEXT: [[PTR_VS_ADD_PTR_EQCMP:%.*]] = icmp eq i32 [[MEMCMP]], 0, !dbg !25
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !14, metadata !DIExpression()), !dbg !26
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !15, metadata !DIExpression()), !dbg !27
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !16, metadata !DIExpression()), !dbg !28
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !17, metadata !DIExpression()), !dbg !29
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !18, metadata !DIExpression()), !dbg !30
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !19, metadata !DIExpression()), !dbg !25
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !20, metadata !DIExpression()), !dbg !31
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !13, metadata !DIExpression()), !dbg !32
; CHECK-NEXT: br i1 [[PTR_VS_ADD_PTR_EQCMP]], label [[PTR_VS_ADD_PTR_EQCMP_EQUALBB:%.*]], label [[PTR_VS_ADD_PTR_EQCMP_UNEQUALBB:%.*]], !dbg !25
; CHECK: ptr.vs.add.ptr.eqcmp.equalbb:
; CHECK-NEXT: br label [[CLEANUP_LOOPEXIT:%.*]], !dbg !33
; CHECK: ptr.vs.add.ptr.eqcmp.unequalbb:
; CHECK-NEXT: br label [[CLEANUP_LOOPEXIT]], !dbg !34
; CHECK: cleanup.loopexit:
; CHECK-NEXT: [[RES_PH:%.*]] = phi i1 [ false, [[FOR_BODY]] ], [ true, [[FOR_COND]] ]
; CHECK-NEXT: [[RES_PH:%.*]] = phi i1 [ false, [[PTR_VS_ADD_PTR_EQCMP_UNEQUALBB]] ], [ true, [[PTR_VS_ADD_PTR_EQCMP_EQUALBB]] ]
; CHECK-NEXT: br label [[CLEANUP]], !dbg !35
; CHECK: cleanup:
; CHECK-NEXT: [[RES:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ [[RES_PH]], [[CLEANUP_LOOPEXIT]] ], !dbg !36
@ -106,11 +105,11 @@ define void @_Z16loop_within_loopmPPKcS1_Pm(i64 %outer_count, i8** %ptr0, i8** %
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[T0]], metadata !42, metadata !DIExpression()), !dbg !66
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[COUNT:%.*]], i64 [[I_012]], !dbg !67
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64* [[ARRAYIDX2]], metadata !43, metadata !DIExpression()), !dbg !67
; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX2]], !dbg !68
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[T1]], metadata !44, metadata !DIExpression()), !dbg !68
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 [[T1]], !dbg !69
; CHECK-NEXT: [[T1_BYTECOUNT:%.*]] = load i64, i64* [[ARRAYIDX2]], !dbg !68
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[T1_BYTECOUNT]], metadata !44, metadata !DIExpression()), !dbg !68
; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 [[T1_BYTECOUNT]], !dbg !69
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[ADD_PTR]], metadata !45, metadata !DIExpression()), !dbg !69
; CHECK-NEXT: [[CMP5_I_I:%.*]] = icmp eq i64 [[T1]], 0, !dbg !70
; CHECK-NEXT: [[CMP5_I_I:%.*]] = icmp eq i64 [[T1_BYTECOUNT]], 0, !dbg !70
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP5_I_I]], metadata !46, metadata !DIExpression()), !dbg !70
; CHECK-NEXT: br i1 [[CMP5_I_I]], label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT]], label [[FOR_BODY_I_I_PREHEADER:%.*]], !dbg !62
; CHECK: for.body.i.i.preheader:
@ -118,39 +117,35 @@ define void @_Z16loop_within_loopmPPKcS1_Pm(i64 %outer_count, i8** %ptr0, i8** %
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8** [[ARRAYIDX3]], metadata !47, metadata !DIExpression()), !dbg !71
; CHECK-NEXT: [[T2:%.*]] = load i8*, i8** [[ARRAYIDX3]], !dbg !72
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[T2]], metadata !48, metadata !DIExpression()), !dbg !72
; CHECK-NEXT: br label [[FOR_BODY_I_I:%.*]], !dbg !73
; CHECK: for.body.i.i:
; CHECK-NEXT: [[__FIRST2_ADDR_07_I_I:%.*]] = phi i8* [ [[INCDEC_PTR1_I_I:%.*]], [[FOR_INC_I_I:%.*]] ], [ [[T2]], [[FOR_BODY_I_I_PREHEADER]] ], !dbg !74
; CHECK-NEXT: [[__FIRST1_ADDR_06_I_I:%.*]] = phi i8* [ [[INCDEC_PTR_I_I:%.*]], [[FOR_INC_I_I]] ], [ [[T0]], [[FOR_BODY_I_I_PREHEADER]] ], !dbg !75
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[__FIRST2_ADDR_07_I_I]], metadata !49, metadata !DIExpression()), !dbg !74
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[__FIRST1_ADDR_06_I_I]], metadata !50, metadata !DIExpression()), !dbg !75
; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[__FIRST1_ADDR_06_I_I]], !dbg !76
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8 [[T3]], metadata !51, metadata !DIExpression()), !dbg !76
; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[__FIRST2_ADDR_07_I_I]], !dbg !77
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8 [[T4]], metadata !52, metadata !DIExpression()), !dbg !77
; CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp eq i8 [[T3]], [[T4]], !dbg !78
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP_I_I_I]], metadata !53, metadata !DIExpression()), !dbg !78
; CHECK-NEXT: br i1 [[CMP_I_I_I]], label [[FOR_INC_I_I]], label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT:%.*]], !dbg !79
; CHECK: for.inc.i.i:
; CHECK-NEXT: [[INCDEC_PTR_I_I]] = getelementptr inbounds i8, i8* [[__FIRST1_ADDR_06_I_I]], i64 1, !dbg !80
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[INCDEC_PTR_I_I]], metadata !54, metadata !DIExpression()), !dbg !80
; CHECK-NEXT: [[INCDEC_PTR1_I_I]] = getelementptr inbounds i8, i8* [[__FIRST2_ADDR_07_I_I]], i64 1, !dbg !81
; CHECK-NEXT: call void @llvm.dbg.value(metadata i8* [[INCDEC_PTR1_I_I]], metadata !55, metadata !DIExpression()), !dbg !81
; CHECK-NEXT: [[CMP_I_I:%.*]] = icmp eq i8* [[INCDEC_PTR_I_I]], [[ADD_PTR]], !dbg !82
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP_I_I]], metadata !56, metadata !DIExpression()), !dbg !82
; CHECK-NEXT: br i1 [[CMP_I_I]], label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]], label [[FOR_BODY_I_I]], !dbg !83
; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[T0]], i8* [[T2]], i64 [[T1_BYTECOUNT]]), !dbg !73
; CHECK-NEXT: [[T0_VS_T2_EQCMP:%.*]] = icmp eq i32 [[MEMCMP]], 0, !dbg !73
; CHECK-NEXT: br label [[FOR_BODY_I_I_BCMPDISPATCHBB:%.*]]
; CHECK: for.body.i.i.bcmpdispatchbb:
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !49, metadata !DIExpression()), !dbg !74
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !50, metadata !DIExpression()), !dbg !75
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !51, metadata !DIExpression()), !dbg !76
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !52, metadata !DIExpression()), !dbg !77
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !53, metadata !DIExpression()), !dbg !73
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !54, metadata !DIExpression()), !dbg !78
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !55, metadata !DIExpression()), !dbg !79
; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata !56, metadata !DIExpression()), !dbg !80
; CHECK-NEXT: br i1 [[T0_VS_T2_EQCMP]], label [[T0_VS_T2_EQCMP_EQUALBB:%.*]], label [[T0_VS_T2_EQCMP_UNEQUALBB:%.*]], !dbg !73
; CHECK: t0.vs.t2.eqcmp.equalbb:
; CHECK-NEXT: br i1 true, label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT:%.*]], label [[FOR_BODY_I_I_BCMPDISPATCHBB]], !dbg !81
; CHECK: t0.vs.t2.eqcmp.unequalbb:
; CHECK-NEXT: br i1 true, label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]], label [[FOR_BODY_I_I_BCMPDISPATCHBB]], !dbg !82
; CHECK: _ZNSt3__15equalIPKcS2_EEbT_S3_T0_.exit.loopexit:
; CHECK-NEXT: [[RETVAL_0_I_I_PH:%.*]] = phi i1 [ false, [[FOR_BODY_I_I]] ], [ true, [[FOR_INC_I_I]] ]
; CHECK-NEXT: br label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT]], !dbg !84
; CHECK-NEXT: [[RETVAL_0_I_I_PH:%.*]] = phi i1 [ false, [[T0_VS_T2_EQCMP_UNEQUALBB]] ], [ true, [[T0_VS_T2_EQCMP_EQUALBB]] ]
; CHECK-NEXT: br label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT]], !dbg !83
; CHECK: _ZNSt3__15equalIPKcS2_EEbT_S3_T0_.exit:
; CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi i1 [ true, [[FOR_BODY]] ], [ [[RETVAL_0_I_I_PH]], [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]] ], !dbg !85
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[RETVAL_0_I_I]], metadata !57, metadata !DIExpression()), !dbg !85
; CHECK-NEXT: tail call void @_Z4sinkb(i1 [[RETVAL_0_I_I]]), !dbg !84
; CHECK-NEXT: [[INC]] = add nuw i64 [[I_012]], 1, !dbg !86
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[INC]], metadata !58, metadata !DIExpression()), !dbg !86
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INC]], [[OUTER_COUNT]], !dbg !87
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP]], metadata !59, metadata !DIExpression()), !dbg !87
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg !88
; CHECK-NEXT: [[RETVAL_0_I_I:%.*]] = phi i1 [ true, [[FOR_BODY]] ], [ [[RETVAL_0_I_I_PH]], [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]] ], !dbg !84
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[RETVAL_0_I_I]], metadata !57, metadata !DIExpression()), !dbg !84
; CHECK-NEXT: tail call void @_Z4sinkb(i1 [[RETVAL_0_I_I]]), !dbg !83
; CHECK-NEXT: [[INC]] = add nuw i64 [[I_012]], 1, !dbg !85
; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[INC]], metadata !58, metadata !DIExpression()), !dbg !85
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INC]], [[OUTER_COUNT]], !dbg !86
; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP]], metadata !59, metadata !DIExpression()), !dbg !86
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg !87
;
entry:
%cmp11 = icmp eq i64 %outer_count, 0

View File

@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-idiom < %s -S | FileCheck %s
; RUN: opt -loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"