[LoopIdiomRecognize] Recommit: BCmp loop idiom recognition

Summary: This is a recommit; this originally landed in rL370454 but was subsequently reverted in rL370788 due to https://bugs.llvm.org/show_bug.cgi?id=43206 The reduced testcase was added to bcmp-negative-tests.ll as @pr43206_different_loops - we must ensure that the SCEVs we got are both for the same loop we are currently investigating. Original commit message: @mclow.lists brought this issue up in IRC. It is a reasonably common problem to compare two values for equality. Those may be just some integers, strings or arrays of integers. In C, there are the `memcmp()` and `bcmp()` functions. In C++, there exists the `std::equal()` algorithm. One can also write that function manually. libstdc++'s `std::equal()` is specialized to directly call `memcmp()` for various types, but not `std::byte` from C++2a. https://godbolt.org/z/mx2ejJ libc++ does not do anything like that, it simply relies on simple C++'s `operator==()`. https://godbolt.org/z/er0Zwf (GOOD!) So likely, there exist certain performance opportunities. Let's compare performance of naive `std::equal()` (no `memcmp()`) with one that is using `memcmp()` (in this case, compiled with a modified compiler). 
{F8768213} ``` #include <algorithm> #include <cmath> #include <cstdint> #include <iterator> #include <limits> #include <random> #include <type_traits> #include <utility> #include <vector> #include "benchmark/benchmark.h" template <class T> bool equal(T* a, T* a_end, T* b) noexcept { for (; a != a_end; ++a, ++b) { if (*a != *b) return false; } return true; } template <typename T> std::vector<T> getVectorOfRandomNumbers(size_t count) { std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<T> dis(std::numeric_limits<T>::min(), std::numeric_limits<T>::max()); std::vector<T> v; v.reserve(count); std::generate_n(std::back_inserter(v), count, [&dis, &gen]() { return dis(gen); }); assert(v.size() == count); return v; } struct Identical { template <typename T> static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) { auto Tmp = getVectorOfRandomNumbers<T>(count); return std::make_pair(Tmp, std::move(Tmp)); } }; struct InequalHalfway { template <typename T> static std::pair<std::vector<T>, std::vector<T>> Gen(size_t count) { auto V0 = getVectorOfRandomNumbers<T>(count); auto V1 = V0; V1[V1.size() / size_t(2)]++; // just change the value. 
return std::make_pair(std::move(V0), std::move(V1)); } }; template <class T, class Gen> void BM_bcmp(benchmark::State& state) { const size_t Length = state.range(0); const std::pair<std::vector<T>, std::vector<T>> Data = Gen::template Gen<T>(Length); const std::vector<T>& a = Data.first; const std::vector<T>& b = Data.second; assert(a.size() == Length && b.size() == a.size()); benchmark::ClobberMemory(); benchmark::DoNotOptimize(a); benchmark::DoNotOptimize(a.data()); benchmark::DoNotOptimize(b); benchmark::DoNotOptimize(b.data()); for (auto _ : state) { const bool is_equal = equal(a.data(), a.data() + a.size(), b.data()); benchmark::DoNotOptimize(is_equal); } state.SetComplexityN(Length); state.counters["eltcnt"] = benchmark::Counter(Length, benchmark::Counter::kIsIterationInvariant); state.counters["eltcnt/sec"] = benchmark::Counter(Length, benchmark::Counter::kIsIterationInvariantRate); const size_t BytesRead = 2 * sizeof(T) * Length; state.counters["bytes_read/iteration"] = benchmark::Counter(BytesRead, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024); state.counters["bytes_read/sec"] = benchmark::Counter( BytesRead, benchmark::Counter::kIsIterationInvariantRate, benchmark::Counter::OneK::kIs1024); } template <typename T> static void CustomArguments(benchmark::internal::Benchmark* b) { const size_t L2SizeBytes = []() { for (const benchmark::CPUInfo::CacheInfo& I : benchmark::CPUInfo::Get().caches) { if (I.level == 2) return I.size; } return 0; }(); // What is the largest range we can check to always fit within given L2 cache? 
const size_t MaxLen = L2SizeBytes / /*total bufs*/ 2 / /*maximal elt size*/ sizeof(T) / /*safety margin*/ 2; b->RangeMultiplier(2)->Range(1, MaxLen)->Complexity(benchmark::oN); } BENCHMARK_TEMPLATE(BM_bcmp, uint8_t, Identical) ->Apply(CustomArguments<uint8_t>); BENCHMARK_TEMPLATE(BM_bcmp, uint16_t, Identical) ->Apply(CustomArguments<uint16_t>); BENCHMARK_TEMPLATE(BM_bcmp, uint32_t, Identical) ->Apply(CustomArguments<uint32_t>); BENCHMARK_TEMPLATE(BM_bcmp, uint64_t, Identical) ->Apply(CustomArguments<uint64_t>); BENCHMARK_TEMPLATE(BM_bcmp, uint8_t, InequalHalfway) ->Apply(CustomArguments<uint8_t>); BENCHMARK_TEMPLATE(BM_bcmp, uint16_t, InequalHalfway) ->Apply(CustomArguments<uint16_t>); BENCHMARK_TEMPLATE(BM_bcmp, uint32_t, InequalHalfway) ->Apply(CustomArguments<uint32_t>); BENCHMARK_TEMPLATE(BM_bcmp, uint64_t, InequalHalfway) ->Apply(CustomArguments<uint64_t>); ``` {F8768210} ``` $ ~/src/googlebenchmark/tools/compare.py --no-utest benchmarks build-{old,new}/test/llvm-bcmp-bench RUNNING: build-old/test/llvm-bcmp-bench --benchmark_out=/tmp/tmpb6PEUx 2019-04-25 21:17:11 Running build-old/test/llvm-bcmp-bench Run on (8 X 4000 MHz CPU s) CPU Caches: L1 Data 16K (x8) L1 Instruction 64K (x4) L2 Unified 2048K (x4) L3 Unified 8192K (x1) Load Average: 0.65, 3.90, 4.14 --------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... 
--------------------------------------------------------------------------------------------------- <...> BM_bcmp<uint8_t, Identical>/512000 432131 ns 432101 ns 1613 bytes_read/iteration=1000k bytes_read/sec=2.20706G/s eltcnt=825.856M eltcnt/sec=1.18491G/s BM_bcmp<uint8_t, Identical>_BigO 0.86 N 0.86 N BM_bcmp<uint8_t, Identical>_RMS 8 % 8 % <...> BM_bcmp<uint16_t, Identical>/256000 161408 ns 161409 ns 4027 bytes_read/iteration=1000k bytes_read/sec=5.90843G/s eltcnt=1030.91M eltcnt/sec=1.58603G/s BM_bcmp<uint16_t, Identical>_BigO 0.67 N 0.67 N BM_bcmp<uint16_t, Identical>_RMS 25 % 25 % <...> BM_bcmp<uint32_t, Identical>/128000 81497 ns 81488 ns 8415 bytes_read/iteration=1000k bytes_read/sec=11.7032G/s eltcnt=1077.12M eltcnt/sec=1.57078G/s BM_bcmp<uint32_t, Identical>_BigO 0.71 N 0.71 N BM_bcmp<uint32_t, Identical>_RMS 42 % 42 % <...> BM_bcmp<uint64_t, Identical>/64000 50138 ns 50138 ns 10909 bytes_read/iteration=1000k bytes_read/sec=19.0209G/s eltcnt=698.176M eltcnt/sec=1.27647G/s BM_bcmp<uint64_t, Identical>_BigO 0.84 N 0.84 N BM_bcmp<uint64_t, Identical>_RMS 27 % 27 % <...> BM_bcmp<uint8_t, InequalHalfway>/512000 192405 ns 192392 ns 3638 bytes_read/iteration=1000k bytes_read/sec=4.95694G/s eltcnt=1.86266G eltcnt/sec=2.66124G/s BM_bcmp<uint8_t, InequalHalfway>_BigO 0.38 N 0.38 N BM_bcmp<uint8_t, InequalHalfway>_RMS 3 % 3 % <...> BM_bcmp<uint16_t, InequalHalfway>/256000 127858 ns 127860 ns 5477 bytes_read/iteration=1000k bytes_read/sec=7.45873G/s eltcnt=1.40211G eltcnt/sec=2.00219G/s BM_bcmp<uint16_t, InequalHalfway>_BigO 0.50 N 0.50 N BM_bcmp<uint16_t, InequalHalfway>_RMS 0 % 0 % <...> BM_bcmp<uint32_t, InequalHalfway>/128000 49140 ns 49140 ns 14281 bytes_read/iteration=1000k bytes_read/sec=19.4072G/s eltcnt=1.82797G eltcnt/sec=2.60478G/s BM_bcmp<uint32_t, InequalHalfway>_BigO 0.40 N 0.40 N BM_bcmp<uint32_t, InequalHalfway>_RMS 18 % 18 % <...> BM_bcmp<uint64_t, InequalHalfway>/64000 32101 ns 32099 ns 21786 bytes_read/iteration=1000k bytes_read/sec=29.7101G/s 
eltcnt=1.3943G eltcnt/sec=1.99381G/s BM_bcmp<uint64_t, InequalHalfway>_BigO 0.50 N 0.50 N BM_bcmp<uint64_t, InequalHalfway>_RMS 1 % 1 % RUNNING: build-new/test/llvm-bcmp-bench --benchmark_out=/tmp/tmpQ46PP0 2019-04-25 21:19:29 Running build-new/test/llvm-bcmp-bench Run on (8 X 4000 MHz CPU s) CPU Caches: L1 Data 16K (x8) L1 Instruction 64K (x4) L2 Unified 2048K (x4) L3 Unified 8192K (x1) Load Average: 1.01, 2.85, 3.71 --------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------------------------- <...> BM_bcmp<uint8_t, Identical>/512000 18593 ns 18590 ns 37565 bytes_read/iteration=1000k bytes_read/sec=51.2991G/s eltcnt=19.2333G eltcnt/sec=27.541G/s BM_bcmp<uint8_t, Identical>_BigO 0.04 N 0.04 N BM_bcmp<uint8_t, Identical>_RMS 37 % 37 % <...> BM_bcmp<uint16_t, Identical>/256000 18950 ns 18948 ns 37223 bytes_read/iteration=1000k bytes_read/sec=50.3324G/s eltcnt=9.52909G eltcnt/sec=13.511G/s BM_bcmp<uint16_t, Identical>_BigO 0.08 N 0.08 N BM_bcmp<uint16_t, Identical>_RMS 34 % 34 % <...> BM_bcmp<uint32_t, Identical>/128000 18627 ns 18627 ns 37895 bytes_read/iteration=1000k bytes_read/sec=51.198G/s eltcnt=4.85056G eltcnt/sec=6.87168G/s BM_bcmp<uint32_t, Identical>_BigO 0.16 N 0.16 N BM_bcmp<uint32_t, Identical>_RMS 35 % 35 % <...> BM_bcmp<uint64_t, Identical>/64000 18855 ns 18855 ns 37458 bytes_read/iteration=1000k bytes_read/sec=50.5791G/s eltcnt=2.39731G eltcnt/sec=3.3943G/s BM_bcmp<uint64_t, Identical>_BigO 0.32 N 0.32 N BM_bcmp<uint64_t, Identical>_RMS 33 % 33 % <...> BM_bcmp<uint8_t, InequalHalfway>/512000 9570 ns 9569 ns 73500 bytes_read/iteration=1000k bytes_read/sec=99.6601G/s eltcnt=37.632G eltcnt/sec=53.5046G/s BM_bcmp<uint8_t, InequalHalfway>_BigO 0.02 N 0.02 N BM_bcmp<uint8_t, InequalHalfway>_RMS 29 % 29 % <...> BM_bcmp<uint16_t, InequalHalfway>/256000 9547 ns 9547 ns 74343 
bytes_read/iteration=1000k bytes_read/sec=99.8971G/s eltcnt=19.0318G eltcnt/sec=26.8159G/s BM_bcmp<uint16_t, InequalHalfway>_BigO 0.04 N 0.04 N BM_bcmp<uint16_t, InequalHalfway>_RMS 29 % 29 % <...> BM_bcmp<uint32_t, InequalHalfway>/128000 9396 ns 9394 ns 73521 bytes_read/iteration=1000k bytes_read/sec=101.518G/s eltcnt=9.41069G eltcnt/sec=13.6255G/s BM_bcmp<uint32_t, InequalHalfway>_BigO 0.08 N 0.08 N BM_bcmp<uint32_t, InequalHalfway>_RMS 30 % 30 % <...> BM_bcmp<uint64_t, InequalHalfway>/64000 9499 ns 9498 ns 73802 bytes_read/iteration=1000k bytes_read/sec=100.405G/s eltcnt=4.72333G eltcnt/sec=6.73808G/s BM_bcmp<uint64_t, InequalHalfway>_BigO 0.16 N 0.16 N BM_bcmp<uint64_t, InequalHalfway>_RMS 28 % 28 % Comparing build-old/test/llvm-bcmp-bench to build-new/test/llvm-bcmp-bench Benchmark Time CPU Time Old Time New CPU Old CPU New --------------------------------------------------------------------------------------------------------------------------------------- <...> BM_bcmp<uint8_t, Identical>/512000 -0.9570 -0.9570 432131 18593 432101 18590 <...> BM_bcmp<uint16_t, Identical>/256000 -0.8826 -0.8826 161408 18950 161409 18948 <...> BM_bcmp<uint32_t, Identical>/128000 -0.7714 -0.7714 81497 18627 81488 18627 <...> BM_bcmp<uint64_t, Identical>/64000 -0.6239 -0.6239 50138 18855 50138 18855 <...> BM_bcmp<uint8_t, InequalHalfway>/512000 -0.9503 -0.9503 192405 9570 192392 9569 <...> BM_bcmp<uint16_t, InequalHalfway>/256000 -0.9253 -0.9253 127858 9547 127860 9547 <...> BM_bcmp<uint32_t, InequalHalfway>/128000 -0.8088 -0.8088 49140 9396 49140 9394 <...> BM_bcmp<uint64_t, InequalHalfway>/64000 -0.7041 -0.7041 32101 9499 32099 9498 ``` What can we tell from the benchmark? * Performance of naive equality check somewhat improves with element size, maxing out at eltcnt/sec=1.58603G/s for uint16_t, or bytes_read/sec=19.0209G/s for uint64_t. I think, that instability implies performance problems. 
* Performance of `memcmp()`-aware benchmark always maxes out at around bytes_read/sec=51.2991G/s for every type. That is 2.6x the throughput of the naive variant! * eltcnt/sec metric for the `memcmp()`-aware benchmark maxes out at eltcnt/sec=27.541G/s for uint8_t (was: eltcnt/sec=1.18491G/s, so 24x) and linearly decreases with element size. For uint64_t, it's ~4x+ the elements/second. * The call is obviously pricier than the loop for small element counts. As can be seen from the full output {F8768210}, `memcmp()` is almost universally worse, independent of the element size (and thus buffer size), when the element count is less than 8. So all in all, the bcmp idiom does indeed offer untapped performance headroom. This diff implements said idiom recognition. I think reasonable test coverage is present, but do tell if anything obvious is missing. Now, quality. This does build and pass the test-suite, at least without any non-bundled elements. {F8768216} {F8768217} This transform fires 91 times: ``` $ /build/test-suite/utils/compare.py -m loop-idiom.NumBCmp result-new.json Tests: 1149 Metric: loop-idiom.NumBCmp Program result-new MultiSourc...Benchmarks/7zip/7zip-benchmark 79.00 MultiSource/Applications/d/make_dparser 3.00 SingleSource/UnitTests/vla 2.00 MultiSource/Applications/Burg/burg 1.00 MultiSourc.../Applications/JM/lencod/lencod 1.00 MultiSource/Applications/lemon/lemon 1.00 MultiSource/Benchmarks/Bullet/bullet 1.00 MultiSourc...e/Benchmarks/MallocBench/gs/gs 1.00 MultiSourc...gs-C/TimberWolfMC/timberwolfmc 1.00 MultiSourc...Prolangs-C/simulator/simulator 1.00 ``` The size changes are: I'm not sure what's going on with SingleSource/UnitTests/vla.test yet, did not look. 
``` $ /build/test-suite/utils/compare.py -m size..text result-{old,new}.json --filter-hash Tests: 1149 Same hash: 907 (filtered out) Remaining: 242 Metric: size..text Program result-old result-new diff test-suite...ingleSource/UnitTests/vla.test 753.00 833.00 10.6% test-suite...marks/7zip/7zip-benchmark.test 1001697.00 966657.00 -3.5% test-suite...ngs-C/simulator/simulator.test 32369.00 32321.00 -0.1% test-suite...plications/d/make_dparser.test 89585.00 89505.00 -0.1% test-suite...ce/Applications/Burg/burg.test 40817.00 40785.00 -0.1% test-suite.../Applications/lemon/lemon.test 47281.00 47249.00 -0.1% test-suite...TimberWolfMC/timberwolfmc.test 250065.00 250113.00 0.0% test-suite...chmarks/MallocBench/gs/gs.test 149889.00 149873.00 -0.0% test-suite...ications/JM/lencod/lencod.test 769585.00 769569.00 -0.0% test-suite.../Benchmarks/Bullet/bullet.test 770049.00 770049.00 0.0% test-suite...HMARK_ANISTROPIC_DIFFUSION/128 NaN NaN nan% test-suite...HMARK_ANISTROPIC_DIFFUSION/256 NaN NaN nan% test-suite...CHMARK_ANISTROPIC_DIFFUSION/64 NaN NaN nan% test-suite...CHMARK_ANISTROPIC_DIFFUSION/32 NaN NaN nan% test-suite...ENCHMARK_BILATERAL_FILTER/64/4 NaN NaN nan% Geomean difference nan% result-old result-new diff count 1.000000e+01 10.00000 10.000000 mean 3.152090e+05 311695.40000 0.006749 std 3.790398e+05 372091.42232 0.036605 min 7.530000e+02 833.00000 -0.034981 25% 4.243300e+04 42401.00000 -0.000866 50% 1.197370e+05 119689.00000 -0.000392 75% 6.397050e+05 639705.00000 -0.000005 max 1.001697e+06 966657.00000 0.106242 ``` I don't have timings though. And now to the code. The basic idea is to completely replace the whole loop. If we can't fully kill it, don't transform. I have left one or two comments in the code, so hopefully it can be understood. Also, there is a few TODO's that i have left for follow-ups: * widening of `memcmp()`/`bcmp()` * step smaller than the comparison size * Metadata propagation * more than two blocks as long as there is still a single backedge? 
* ??? Reviewers: reames, fhahn, mkazantsev, chandlerc, craig.topper, courbet Reviewed By: courbet Subscribers: miyuki, hiraditya, xbolva00, nikic, jfb, gchatelet, courbet, llvm-commits, mclow.lists Tags: #llvm Differential Revision: https://reviews.llvm.org/D61144 llvm-svn: 374662
2024-11-23 03:02:36 +01:00 · 2019-10-12 15:35:32 +00:00 · 2019-10-12 15:35:32 +00:00 · 0dff68630e
commit 0dff68630e
parent 7d56790f6f
5 changed files with 1280 additions and 603 deletions
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@ -66,6 +66,9 @@ Non-comprehensive list of changes in this release
  Undefined Behaviour Sanitizer ``-fsanitize=pointer-overflow`` check
  will now catch such cases.

+* The Loop Idiom Recognition (``-loop-idiom``) pass has learned to recognize
+  the ``bcmp`` pattern and convert it into a call to the ``bcmp`` (or
+  ``memcmp``) function.

 Changes to the LLVM IR
 ----------------------
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@ -41,6 +41,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@ -77,16 +78,20 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
@ -102,6 +107,7 @@ using namespace llvm;

 STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+STATISTIC(NumBCmp, "Number of memcmp's formed from loop 2xload+eq-compare");

 static cl::opt<bool> UseLIRCodeSizeHeurs(
    "use-lir-code-size-heurs",
@ -111,6 +117,26 @@ static cl::opt<bool> UseLIRCodeSizeHeurs(

 namespace {

+// FIXME: reinventing the wheel much? Is there a cleaner solution?
+struct PMAbstraction {
+  virtual void markLoopAsDeleted(Loop *L) = 0;
+  virtual ~PMAbstraction() = default;
+};
+struct LegacyPMAbstraction : PMAbstraction {
+  LPPassManager &LPM;
+  LegacyPMAbstraction(LPPassManager &LPM) : LPM(LPM) {}
+  virtual ~LegacyPMAbstraction() = default;
+  void markLoopAsDeleted(Loop *L) override { LPM.markLoopAsDeleted(*L); }
+};
+struct NewPMAbstraction : PMAbstraction {
+  LPMUpdater &Updater;
+  NewPMAbstraction(LPMUpdater &Updater) : Updater(Updater) {}
+  virtual ~NewPMAbstraction() = default;
+  void markLoopAsDeleted(Loop *L) override {
+    Updater.markLoopAsDeleted(*L, L->getName());
+  }
+};
+
 class LoopIdiomRecognize {
  Loop *CurLoop = nullptr;
  AliasAnalysis *AA;
@ -120,6 +146,7 @@ class LoopIdiomRecognize {
  TargetLibraryInfo *TLI;
  const TargetTransformInfo *TTI;
  const DataLayout *DL;
+  PMAbstraction &LoopDeleter;
  OptimizationRemarkEmitter &ORE;
  bool ApplyCodeSizeHeuristics;

@ -128,9 +155,10 @@ public:
                              LoopInfo *LI, ScalarEvolution *SE,
                              TargetLibraryInfo *TLI,
                              const TargetTransformInfo *TTI,
-                              const DataLayout *DL,
+                              const DataLayout *DL, PMAbstraction &LoopDeleter,
                              OptimizationRemarkEmitter &ORE)
-      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {}
+      : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL),
+        LoopDeleter(LoopDeleter), ORE(ORE) {}

  bool runOnLoop(Loop *L);

@ -144,6 +172,8 @@ private:
  bool HasMemset;
  bool HasMemsetPattern;
  bool HasMemcpy;
+  bool HasMemCmp;
+  bool HasBCmp;

  /// Return code for isLegalStore()
  enum LegalStoreKind {
@ -186,6 +216,32 @@ private:

  bool runOnNoncountableLoop();

+  struct CmpLoopStructure {
+    Value *BCmpValue, *LatchCmpValue;
+    BasicBlock *HeaderBrEqualBB, *HeaderBrUnequalBB;
+    BasicBlock *LatchBrFinishBB, *LatchBrContinueBB;
+  };
+  bool matchBCmpLoopStructure(CmpLoopStructure &CmpLoop) const;
+  struct CmpOfLoads {
+    ICmpInst::Predicate BCmpPred;
+    Value *LoadSrcA, *LoadSrcB;
+    Value *LoadA, *LoadB;
+  };
+  bool matchBCmpOfLoads(Value *BCmpValue, CmpOfLoads &CmpOfLoads) const;
+  bool recognizeBCmpLoopControlFlow(const CmpOfLoads &CmpOfLoads,
+                                    CmpLoopStructure &CmpLoop) const;
+  bool recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes, CmpOfLoads &CmpOfLoads,
+                             const SCEV *&SrcA, const SCEV *&SrcB,
+                             const SCEV *&Iterations) const;
+  bool detectBCmpIdiom(ICmpInst *&BCmpInst, CmpInst *&LatchCmpInst,
+                       LoadInst *&LoadA, LoadInst *&LoadB, const SCEV *&SrcA,
+                       const SCEV *&SrcB, const SCEV *&NBytes) const;
+  BasicBlock *transformBCmpControlFlow(ICmpInst *ComparedEqual);
+  void transformLoopToBCmp(ICmpInst *BCmpInst, CmpInst *LatchCmpInst,
+                           LoadInst *LoadA, LoadInst *LoadB, const SCEV *SrcA,
+                           const SCEV *SrcB, const SCEV *NBytes);
+  bool recognizeBCmp();
+
  bool recognizePopcount();
  void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                               PHINode *CntPhi, Value *Var);
@ -223,13 +279,14 @@ public:
        &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
            *L->getHeader()->getParent());
    const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+    LegacyPMAbstraction LoopDeleter(LPM);

    // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
    // pass.  Function analyses need to be preserved across loop transformations
    // but ORE cannot be preserved (see comment before the pass definition).
    OptimizationRemarkEmitter ORE(L->getHeader()->getParent());

-    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE);
+    LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, LoopDeleter, ORE);
    return LIR.runOnLoop(L);
  }

@ -248,7 +305,7 @@ char LoopIdiomRecognizeLegacyPass::ID = 0;

 PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
                                              LoopStandardAnalysisResults &AR,
-                                              LPMUpdater &) {
+                                              LPMUpdater &Updater) {
  const auto *DL = &L.getHeader()->getModule()->getDataLayout();

  const auto &FAM =
@ -262,8 +319,9 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
        "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
        "at a higher level");

+  NewPMAbstraction LoopDeleter(Updater);
  LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
-                         *ORE);
+                         LoopDeleter, *ORE);
  if (!LIR.runOnLoop(&L))
    return PreservedAnalyses::all();

@ -300,7 +358,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {

  // Disable loop idiom recognition if the function's name is a common idiom.
  StringRef Name = L->getHeader()->getParent()->getName();
-  if (Name == "memset" || Name == "memcpy")
+  if (Name == "memset" || Name == "memcpy" || Name == "memcmp" ||
+      Name == "bcmp")
    return false;

  // Determine if code size heuristics need to be applied.
@ -310,8 +369,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
  HasMemset = TLI->has(LibFunc_memset);
  HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
  HasMemcpy = TLI->has(LibFunc_memcpy);
+  HasMemCmp = TLI->has(LibFunc_memcmp);
+  HasBCmp = TLI->has(LibFunc_bcmp);

-  if (HasMemset || HasMemsetPattern || HasMemcpy)
+  if (HasMemset || HasMemsetPattern || HasMemcpy || HasMemCmp || HasBCmp)
    if (SE->hasLoopInvariantBackedgeTakenCount(L))
      return runOnCountableLoop();

@ -1150,7 +1211,7 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
                    << "] Noncountable Loop %"
                    << CurLoop->getHeader()->getName() << "\n");

-  return recognizePopcount() || recognizeAndInsertFFS();
+  return recognizeBCmp() || recognizePopcount() || recognizeAndInsertFFS();
 }

 /// Check if the given conditional branch is based on the comparison between
@ -1824,3 +1885,804 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
  //   loop. The loop would otherwise not be deleted even if it becomes empty.
  SE->forgetLoop(CurLoop);
 }
+
+bool LoopIdiomRecognize::matchBCmpLoopStructure(
+    CmpLoopStructure &CmpLoop) const {
+  ICmpInst::Predicate BCmpPred;
+
+  // We are looking for the following basic layout:
+  //  PreheaderBB: <preheader>              ; preds = ???
+  //    <...>
+  //    br label %LoopHeaderBB
+  //  LoopHeaderBB: <header,exiting>        ; preds = %PreheaderBB,%LoopLatchBB
+  //    <...>
+  //    %BCmpValue = icmp <...>
+  //    br i1 %BCmpValue, label %LoopLatchBB, label %Successor0
+  //  LoopLatchBB: <latch,exiting>          ; preds = %LoopHeaderBB
+  //    <...>
+  //    %LatchCmpValue = <are we done, or do next iteration?>
+  //    br i1 %LatchCmpValue, label %Successor1, label %LoopHeaderBB
+  //  Successor0: <exit>                    ; preds = %LoopHeaderBB
+  //    <...>
+  //  Successor1: <exit>                    ; preds = %LoopLatchBB
+  //    <...>
+  //
+  // Successor0 and Successor1 may or may not be the same basic block.
+
+  // Match basic frame-work of this supposedly-comparison loop.
+  using namespace PatternMatch;
+  if (!match(CurLoop->getHeader()->getTerminator(),
+             m_Br(m_CombineAnd(m_ICmp(BCmpPred, m_Value(), m_Value()),
+                               m_Value(CmpLoop.BCmpValue)),
+                  CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB)) ||
+      !match(CurLoop->getLoopLatch()->getTerminator(),
+             m_Br(m_CombineAnd(m_Cmp(), m_Value(CmpLoop.LatchCmpValue)),
+                  CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB))) {
+    LLVM_DEBUG(dbgs() << "Basic control-flow layout unrecognized.\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "Recognized basic control-flow layout.\n");
+  return true;
+}
+
+bool LoopIdiomRecognize::matchBCmpOfLoads(Value *BCmpValue,
+                                          CmpOfLoads &CmpOfLoads) const {
+  using namespace PatternMatch;
+  LLVM_DEBUG(dbgs() << "Analyzing header icmp " << *BCmpValue
+                    << "   as bcmp pattern.\n");
+
+  // Match bcmp-style loop header cmp. It must be an eq-icmp of loads. Example:
+  //    %v0 = load <...>, <...>* %LoadSrcA
+  //    %v1 = load <...>, <...>* %LoadSrcB
+  //    %CmpLoop.BCmpValue = icmp eq <...> %v0, %v1
+  // There won't be any no-op bitcasts between load and icmp,
+  // they would have been transformed into a load of bitcast.
+  // FIXME: {b,mem}cmp() calls have the same semantics as icmp. Match them too.
+  if (!match(BCmpValue,
+             m_ICmp(CmpOfLoads.BCmpPred,
+                    m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcA)),
+                                 m_Value(CmpOfLoads.LoadA)),
+                    m_CombineAnd(m_Load(m_Value(CmpOfLoads.LoadSrcB)),
+                                 m_Value(CmpOfLoads.LoadB)))) ||
+      !ICmpInst::isEquality(CmpOfLoads.BCmpPred)) {
+    LLVM_DEBUG(dbgs() << "Loop header icmp did not match bcmp pattern.\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "Recognized header icmp as bcmp pattern with loads:\n\t"
+                    << *CmpOfLoads.LoadA << "\n\t" << *CmpOfLoads.LoadB
+                    << "\n");
+  // FIXME: handle memcmp pattern?
+  return true;
+}
+
+bool LoopIdiomRecognize::recognizeBCmpLoopControlFlow(
+    const CmpOfLoads &CmpOfLoads, CmpLoopStructure &CmpLoop) const {
+  BasicBlock *LoopHeaderBB = CurLoop->getHeader();
+  BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
+
+  // Be wary, comparisons can be inverted, canonicalize order.
+  // If this 'element' comparison passed, we expect to proceed to the next elt.
+  if (CmpOfLoads.BCmpPred != ICmpInst::Predicate::ICMP_EQ)
+    std::swap(CmpLoop.HeaderBrEqualBB, CmpLoop.HeaderBrUnequalBB);
+  // The predicate on loop latch does not matter, just canonicalize some order.
+  if (CmpLoop.LatchBrContinueBB != LoopHeaderBB)
+    std::swap(CmpLoop.LatchBrFinishBB, CmpLoop.LatchBrContinueBB);
+
+  // Check that control-flow between blocks is as expected.
+  if (CmpLoop.HeaderBrEqualBB != LoopLatchBB ||
+      CmpLoop.LatchBrContinueBB != LoopHeaderBB) {
+    LLVM_DEBUG(dbgs() << "Loop control-flow not recognized.\n");
+    return false;
+  }
+
+  SmallVector<BasicBlock *, 2> ExitBlocks;
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  assert(ExitBlocks.size() <= 2U && "Can't have more than two exit blocks.");
+
+  assert(!is_contained(ExitBlocks, CmpLoop.HeaderBrEqualBB) &&
+         is_contained(ExitBlocks, CmpLoop.HeaderBrUnequalBB) &&
+         !is_contained(ExitBlocks, CmpLoop.LatchBrContinueBB) &&
+         is_contained(ExitBlocks, CmpLoop.LatchBrFinishBB) &&
+         "Unexpected exit edges.");
+
+  LLVM_DEBUG(dbgs() << "Recognized loop control-flow.\n");
+
+  LLVM_DEBUG(dbgs() << "Performing side-effect analysis on the loop.\n");
+  assert(CurLoop->isLCSSAForm(*DT) && "Should only get LCSSA-form loops here.");
+  // No loop instructions must be used outside of the loop. Since we are in
+  // LCSSA form, we only need to check successor blocks' PHI nodes' incoming
+  // values for incoming blocks that are the loop basic blocks.
+  for (const BasicBlock *ExitBB : ExitBlocks) {
+    for (const PHINode &PHI : ExitBB->phis()) {
+      for (const BasicBlock *LoopBB :
+           make_filter_range(PHI.blocks(), [this](BasicBlock *PredecessorBB) {
+             return CurLoop->contains(PredecessorBB);
+           })) {
+        const auto *I =
+            dyn_cast<Instruction>(PHI.getIncomingValueForBlock(LoopBB));
+        if (I && CurLoop->contains(I)) {
+          LLVM_DEBUG(dbgs()
+                     << "Loop contains instruction " << *I
+                     << "   which is used outside of the loop in basic block  "
+                     << ExitBB->getName() << "  in phi node  " << PHI << "\n");
+          return false;
+        }
+      }
+    }
+  }
+  // Similarly, the loop should not have any other observable side-effects
+  // other than the final comparison result.
+  for (BasicBlock *LoopBB : CurLoop->blocks()) {
+    for (Instruction &I : *LoopBB) {
+      if (isa<DbgInfoIntrinsic>(I)) // Ignore dbginfo.
+        continue;                   // FIXME: anything else? lifetime info?
+      if ((I.mayHaveSideEffects() || I.isAtomic() || I.isFenceLike()) &&
+          &I != CmpOfLoads.LoadA && &I != CmpOfLoads.LoadB) {
+        LLVM_DEBUG(
+            dbgs() << "Loop contains instruction with potential side-effects: "
+                   << I << "\n");
+        return false;
+      }
+    }
+  }
+  LLVM_DEBUG(dbgs() << "No loop instructions deemed to have side-effects.\n");
+  return true;
+}
+
+bool LoopIdiomRecognize::recognizeBCmpLoopSCEV(uint64_t BCmpTyBytes,
+                                               CmpOfLoads &CmpOfLoads,
+                                               const SCEV *&SrcA,
+                                               const SCEV *&SrcB,
+                                               const SCEV *&Iterations) const {
+  // Try to compute SCEV of the loads, for this loop's scope.
+  const auto *ScevForSrcA = dyn_cast<SCEVAddRecExpr>(
+      SE->getSCEVAtScope(CmpOfLoads.LoadSrcA, CurLoop));
+  const auto *ScevForSrcB = dyn_cast<SCEVAddRecExpr>(
+      SE->getSCEVAtScope(CmpOfLoads.LoadSrcB, CurLoop));
+  if (!ScevForSrcA || !ScevForSrcB) {
+    LLVM_DEBUG(dbgs() << "Failed to get SCEV expressions for load sources.\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Got SCEV expressions (at loop scope) for loads:\n\t"
+                    << *ScevForSrcA << "\n\t" << *ScevForSrcB << "\n");
+
+  // Loads must have following SCEV exprs:  {%ptr,+,BCmpTyBytes}<%LoopHeaderBB>
+  const SCEV *RecStepForA = ScevForSrcA->getStepRecurrence(*SE);
+  const SCEV *RecStepForB = ScevForSrcB->getStepRecurrence(*SE);
+  if (!ScevForSrcA->isAffine() || !ScevForSrcB->isAffine() ||
+      ScevForSrcA->getLoop() != CurLoop || ScevForSrcB->getLoop() != CurLoop ||
+      RecStepForA != RecStepForB || !isa<SCEVConstant>(RecStepForA) ||
+      cast<SCEVConstant>(RecStepForA)->getAPInt() != BCmpTyBytes) {
+    LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads. Only support "
+                         "affine SCEV expressions originating in the loop we "
+                         "are analysing with identical constant positive step, "
+                         "equal to the count of bytes compared. Got:\n\t"
+                      << *RecStepForA << "\n\t" << *RecStepForB << "\n");
+    return false;
+    // FIXME: can support BCmpTyBytes > Step.
+    // But will need to account for the extra bytes compared at the end.
+  }
+
+  SrcA = ScevForSrcA->getStart();
+  SrcB = ScevForSrcB->getStart();
+  LLVM_DEBUG(dbgs() << "Got SCEV expressions for load sources:\n\t" << *SrcA
+                    << "\n\t" << *SrcB << "\n");
+
+  // The load sources must be loop-invants that dominate the loop header.
+  if (SrcA == SE->getCouldNotCompute() || SrcB == SE->getCouldNotCompute() ||
+      !SE->isAvailableAtLoopEntry(SrcA, CurLoop) ||
+      !SE->isAvailableAtLoopEntry(SrcB, CurLoop)) {
+    LLVM_DEBUG(dbgs() << "Unsupported SCEV expressions for loads, unavaliable "
+                         "prior to loop header.\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "SCEV expressions for loads are acceptable.\n");
+
+  // For how many iterations is loop guaranteed not to exit via LoopLatch?
+  // This is one less than the maximal number of comparisons,and is:  n + -1
+  const SCEV *LoopExitCount =
+      SE->getExitCount(CurLoop, CurLoop->getLoopLatch());
+  LLVM_DEBUG(dbgs() << "Got SCEV expression for loop latch exit count: "
+                    << *LoopExitCount << "\n");
+  // Exit count, similarly, must be loop-invant that dominates the loop header.
+  if (LoopExitCount == SE->getCouldNotCompute() ||
+      !LoopExitCount->getType()->isIntOrPtrTy() ||
+      !SE->isAvailableAtLoopEntry(LoopExitCount, CurLoop)) {
+    LLVM_DEBUG(dbgs() << "Unsupported SCEV expression for loop latch exit.\n");
+    return false;
+  }
+
+  // LoopExitCount is always one less than the actual count of iterations.
+  // Do this before cast, else we will be stuck with   1 + zext(-1 + n)
+  Iterations = SE->getAddExpr(
+      LoopExitCount, SE->getOne(LoopExitCount->getType()), SCEV::FlagNUW);
+  assert(Iterations != SE->getCouldNotCompute() &&
+         "Shouldn't fail to increment by one.");
+
+  LLVM_DEBUG(dbgs() << "Computed iteration count: " << *Iterations << "\n");
+  return true;
+}
+
+/// Return true iff the bcmp idiom is detected in the loop.
+///
+/// Additionally, on success:
+/// 1) \p BCmpInst is set to the root byte-comparison instruction.
+/// 2) \p LatchCmpInst is set to the comparison that controls the latch.
+/// 3) \p LoadA is set to the first  LoadInst.
+/// 4) \p LoadB is set to the second LoadInst.
+/// 5) \p SrcA is set to the first  source location that is being compared.
+/// 6) \p SrcB is set to the second source location that is being compared.
+/// 7) \p NBytes is set to the number of bytes to compare.
+///
+/// When false is returned, some of the out-parameters may already have been
+/// written; callers must not rely on any of them in that case.
+bool LoopIdiomRecognize::detectBCmpIdiom(ICmpInst *&BCmpInst,
+                                         CmpInst *&LatchCmpInst,
+                                         LoadInst *&LoadA, LoadInst *&LoadB,
+                                         const SCEV *&SrcA, const SCEV *&SrcB,
+                                         const SCEV *&NBytes) const {
+  LLVM_DEBUG(dbgs() << "Recognizing bcmp idiom\n");
+
+  // Give up if the loop is not in normal form, or has more than 2 blocks.
+  if (!CurLoop->isLoopSimplifyForm() || CurLoop->getNumBlocks() > 2) {
+    LLVM_DEBUG(dbgs() << "Basic loop structure unrecognized.\n");
+    return false;
+  }
+  LLVM_DEBUG(dbgs() << "Recognized basic loop structure.\n");
+
+  CmpLoopStructure CmpLoop;
+  if (!matchBCmpLoopStructure(CmpLoop))
+    return false;
+
+  CmpOfLoads CmpOfLoads;
+  if (!matchBCmpOfLoads(CmpLoop.BCmpValue, CmpOfLoads))
+    return false;
+
+  if (!recognizeBCmpLoopControlFlow(CmpOfLoads, CmpLoop))
+    return false;
+
+  BCmpInst = cast<ICmpInst>(CmpLoop.BCmpValue);        // FIXME: is there no
+  LatchCmpInst = cast<CmpInst>(CmpLoop.LatchCmpValue); // way to combine
+  LoadA = cast<LoadInst>(CmpOfLoads.LoadA);            // these cast with
+  LoadB = cast<LoadInst>(CmpOfLoads.LoadB);            // m_Value() matcher?
+
+  Type *BCmpValTy = BCmpInst->getOperand(0)->getType();
+  LLVMContext &Context = BCmpValTy->getContext();
+  uint64_t BCmpTyBits = DL->getTypeSizeInBits(BCmpValTy);
+  static constexpr uint64_t ByteTyBits = 8;
+
+  LLVM_DEBUG(dbgs() << "Got comparison between values of type " << *BCmpValTy
+                    << " of size " << BCmpTyBits
+                    << " bits (while byte = " << ByteTyBits << " bits).\n");
+  // bcmp()/memcmp() minimal unit of work is a byte. Therefore we must check
+  // that we are dealing with a multiple of a byte here.
+  if (BCmpTyBits % ByteTyBits != 0) {
+    LLVM_DEBUG(dbgs() << "Value size is not a multiple of byte.\n");
+    return false;
+    // FIXME: could still be done under a run-time check that the total bit
+    // count is a multiple of a byte i guess? Or handle remainder separately?
+  }
+
+  // Each comparison is done on this many bytes.
+  uint64_t BCmpTyBytes = BCmpTyBits / ByteTyBits;
+  LLVM_DEBUG(dbgs() << "Size is exactly " << BCmpTyBytes
+                    << " bytes, eligible for bcmp conversion.\n");
+
+  const SCEV *Iterations = nullptr;
+  if (!recognizeBCmpLoopSCEV(BCmpTyBytes, CmpOfLoads, SrcA, SrcB, Iterations))
+    return false;
+
+  // bcmp / memcmp take length argument as size_t, do promotion now.
+  Type *CmpFuncSizeTy = DL->getIntPtrType(Context);
+  // getNoopOrZeroExtend() must never be asked to truncate. If the iteration
+  // count is wider than size_t, the length cannot be represented; give up
+  // instead of tripping over the extension.
+  if (SE->getTypeSizeInBits(Iterations->getType()) >
+      SE->getTypeSizeInBits(CmpFuncSizeTy)) {
+    LLVM_DEBUG(dbgs() << "Iteration count is wider than size_t.\n");
+    return false;
+  }
+  Iterations = SE->getNoopOrZeroExtend(Iterations, CmpFuncSizeTy);
+  assert(Iterations != SE->getCouldNotCompute() && "Promotion failed.");
+  // Note that it didn't do ptrtoint cast, we will need to do it manually.
+
+  // We will be comparing *bytes*, not BCmpTy, we need to recalculate size.
+  // It's a multiplication, and it *could* overflow. But for it to overflow
+  // we'd want to compare more bytes than could be represented by size_t, But
+  // allocation functions also take size_t. So how'd you produce such buffer?
+  // FIXME: we likely need to actually check that we know this won't overflow,
+  //        via llvm::computeOverflowForUnsignedMul().
+  NBytes = SE->getMulExpr(
+      Iterations, SE->getConstant(CmpFuncSizeTy, BCmpTyBytes), SCEV::FlagNUW);
+  assert(NBytes != SE->getCouldNotCompute() &&
+         "Shouldn't fail to multiply by the element size.");
+
+  LLVM_DEBUG(dbgs() << "Computed total byte count: " << *NBytes << "\n");
+
+  // Only plain (simple) loads from address space 0 are handled; anything else
+  // is reported through a missed-optimization remark.
+  if (LoadA->getPointerAddressSpace() != LoadB->getPointerAddressSpace() ||
+      LoadA->getPointerAddressSpace() != 0 || !LoadA->isSimple() ||
+      !LoadB->isSimple()) {
+    StringLiteral L("Unsupported loads in idiom - only support identical, "
+                    "simple loads from address space 0.\n");
+    LLVM_DEBUG(dbgs() << L);
+    ORE.emit([&]() {
+      return OptimizationRemarkMissed(DEBUG_TYPE, "BCmpIdiomUnsupportedLoads",
+                                      BCmpInst->getDebugLoc(),
+                                      CurLoop->getHeader())
+             << L;
+    });
+    return false; // FIXME
+  }
+
+  LLVM_DEBUG(dbgs() << "Recognized bcmp idiom\n");
+  ORE.emit([&]() {
+    return OptimizationRemarkAnalysis(DEBUG_TYPE, "RecognizedBCmpIdiom",
+                                      CurLoop->getStartLoc(),
+                                      CurLoop->getHeader())
+           << "Loop recognized as a bcmp idiom";
+  });
+
+  return true;
+}
+
+/// Rewrite the control flow around the recognized loop so that the loop is
+/// deleted and its two exits are instead selected by \p ComparedEqual.
+///
+/// \p ComparedEqual is expected to already sit in the loop preheader (see the
+/// diagrams below). The loop is first disconnected from the function and
+/// handed to deleteDeadLoop(); afterwards a dispatch block branching on
+/// \p ComparedEqual is wired to two new blocks which forward to the original
+/// exit blocks, so that the incoming values of their PHI nodes are kept.
+///
+/// \returns the dispatch basic block. CurLoop is reset to nullptr.
+BasicBlock *
+LoopIdiomRecognize::transformBCmpControlFlow(ICmpInst *ComparedEqual) {
+  LLVM_DEBUG(dbgs() << "Transforming control-flow.\n");
+  SmallVector<DominatorTree::UpdateType, 8> DTUpdates;
+
+  BasicBlock *PreheaderBB = CurLoop->getLoopPreheader();
+  BasicBlock *HeaderBB = CurLoop->getHeader();
+  BasicBlock *LoopLatchBB = CurLoop->getLoopLatch();
+  // Copy the name: the loop will be gone by the time we need it again.
+  SmallString<32> LoopName = CurLoop->getName();
+  Function *Func = PreheaderBB->getParent();
+  LLVMContext &Context = Func->getContext();
+
+  // Before doing anything, drop SCEV info.
+  SE->forgetLoop(CurLoop);
+
+  // Here we start with: (0/6)
+  //  PreheaderBB: <preheader>        ; preds = ???
+  //    <...>
+  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+  //    %ComparedEqual = icmp eq <...> %memcmp, 0
+  //    br label %LoopHeaderBB
+  //  LoopHeaderBB: <header,exiting>  ; preds = %PreheaderBB,%LoopLatchBB
+  //    <...>
+  //    br i1 %<...>, label %LoopLatchBB, label %Successor0BB
+  //  LoopLatchBB: <latch,exiting>    ; preds = %LoopHeaderBB
+  //    <...>
+  //    br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
+  //  Successor0BB: <exit>            ; preds = %LoopHeaderBB
+  //    %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
+  //    <...>
+  //  Successor1BB: <exit>            ; preds = %LoopLatchBB
+  //    %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
+  //    <...>
+  //
+  // Successor0 and Successor1 may or may not be the same basic block.
+
+  // Decouple the edge between loop preheader basic block and loop header basic
+  // block. Thus the loop has become unreachable.
+  assert(cast<BranchInst>(PreheaderBB->getTerminator())->isUnconditional() &&
+         PreheaderBB->getTerminator()->getSuccessor(0) == HeaderBB &&
+         "Preheader bb must end with an unconditional branch to header bb.");
+  PreheaderBB->getTerminator()->eraseFromParent();
+  DTUpdates.push_back({DominatorTree::Delete, PreheaderBB, HeaderBB});
+
+  // Create a new preheader basic block before loop header basic block.
+  auto *PhonyPreheaderBB = BasicBlock::Create(
+      Context, LoopName + ".phonypreheaderbb", Func, HeaderBB);
+  // And insert an unconditional branch from phony preheader basic block to
+  // loop header basic block.
+  IRBuilder<>(PhonyPreheaderBB).CreateBr(HeaderBB);
+  DTUpdates.push_back({DominatorTree::Insert, PhonyPreheaderBB, HeaderBB});
+
+  // Create a *single* new empty block that we will substitute as a
+  // successor basic block for the loop's exits. This one is temporary.
+  // Much like phony preheader basic block, it is not connected.
+  auto *PhonySuccessorBB =
+      BasicBlock::Create(Context, LoopName + ".phonysuccessorbb", Func,
+                         LoopLatchBB->getNextNode());
+  // That block must have *some* non-PHI instruction, or else deleteDeadLoop()
+  // will mess up cleanup of dbginfo, and verifier will complain.
+  IRBuilder<>(PhonySuccessorBB).CreateUnreachable();
+
+  // Create two new empty blocks that we will use to preserve the original
+  // loop exit control-flow, and preserve the incoming values in the PHI nodes
+  // in loop's successor exit blocks. These will live on.
+  auto *ComparedUnequalBB =
+      BasicBlock::Create(Context, ComparedEqual->getName() + ".unequalbb", Func,
+                         PhonySuccessorBB->getNextNode());
+  auto *ComparedEqualBB =
+      BasicBlock::Create(Context, ComparedEqual->getName() + ".equalbb", Func,
+                         PhonySuccessorBB->getNextNode());
+
+  // By now we have: (1/6)
+  //  PreheaderBB:                    ; preds = ???
+  //    <...>
+  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+  //    %ComparedEqual = icmp eq <...> %memcmp, 0
+  //    [no terminator instruction!]
+  //  PhonyPreheaderBB: <preheader>   ; No preds, UNREACHABLE!
+  //    br label %LoopHeaderBB
+  //  LoopHeaderBB: <header,exiting>  ; preds = %PhonyPreheaderBB, %LoopLatchBB
+  //    <...>
+  //    br i1 %<...>, label %LoopLatchBB, label %Successor0BB
+  //  LoopLatchBB: <latch,exiting>    ; preds = %LoopHeaderBB
+  //    <...>
+  //    br i1 %<...>, label %Successor1BB, label %LoopHeaderBB
+  //  PhonySuccessorBB:               ; No preds, UNREACHABLE!
+  //    unreachable
+  //  EqualBB:                        ; No preds, UNREACHABLE!
+  //    [no terminator instruction!]
+  //  UnequalBB:                      ; No preds, UNREACHABLE!
+  //    [no terminator instruction!]
+  //  Successor0BB: <exit>            ; preds = %LoopHeaderBB
+  //    %S0PHI = phi <...> [ <...>, %LoopHeaderBB ]
+  //    <...>
+  //  Successor1BB: <exit>            ; preds = %LoopLatchBB
+  //    %S1PHI = phi <...> [ <...>, %LoopLatchBB ]
+  //    <...>
+
+  // What is the mapping/replacement basic block for exiting out of the loop
+  // from either of old's loop basic blocks?
+  auto GetReplacementBB = [this, ComparedEqualBB,
+                           ComparedUnequalBB](const BasicBlock *OldBB) {
+    assert(CurLoop->contains(OldBB) && "Only for loop's basic blocks.");
+    if (OldBB == CurLoop->getLoopLatch()) // "all elements compared equal".
+      return ComparedEqualBB;
+    if (OldBB == CurLoop->getHeader()) // "element compared unequal".
+      return ComparedUnequalBB;
+    llvm_unreachable("Only had two basic blocks in loop.");
+  };
+
+  // What are the exits out of this loop?
+  SmallVector<Loop::Edge, 2> LoopExitEdges;
+  CurLoop->getExitEdges(LoopExitEdges);
+  assert(LoopExitEdges.size() == 2 && "Should have exactly two exit edges.");
+
+  // Populate new basic blocks, update the exiting control-flow, PHI nodes.
+  for (const Loop::Edge &Edge : LoopExitEdges) {
+    auto *OldLoopBB = const_cast<BasicBlock *>(Edge.first);
+    auto *SuccessorBB = const_cast<BasicBlock *>(Edge.second);
+    assert(CurLoop->contains(OldLoopBB) && !CurLoop->contains(SuccessorBB) &&
+           "Unexpected edge.");
+
+    // If we would exit the loop from this loop's basic block,
+    // what semantically would that mean? Did comparison succeed or fail?
+    BasicBlock *NewBB = GetReplacementBB(OldLoopBB);
+    assert(NewBB->empty() && "Should not get same new basic block here twice.");
+    IRBuilder<> Builder(NewBB);
+    Builder.SetCurrentDebugLocation(OldLoopBB->getTerminator()->getDebugLoc());
+    Builder.CreateBr(SuccessorBB);
+    DTUpdates.push_back({DominatorTree::Insert, NewBB, SuccessorBB});
+    // Also, be *REALLY* careful with PHI nodes in successor basic block,
+    // update them to receive the same input value, but not from current loop's
+    // basic block, but from new basic block instead.
+    SuccessorBB->replacePhiUsesWith(OldLoopBB, NewBB);
+    // Also, change loop control-flow. This loop's basic block shall no longer
+    // exit from the loop to its original successor basic block, but to our new
+    // phony successor basic block. Note that new successor will be unique exit.
+    OldLoopBB->getTerminator()->replaceSuccessorWith(SuccessorBB,
+                                                     PhonySuccessorBB);
+    DTUpdates.push_back({DominatorTree::Delete, OldLoopBB, SuccessorBB});
+    DTUpdates.push_back({DominatorTree::Insert, OldLoopBB, PhonySuccessorBB});
+  }
+
+  // Inform DomTree about edge changes. Note that LoopInfo is still out-of-date.
+  assert(DTUpdates.size() == 8 && "Update count prediction failed.");
+  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  DTU.applyUpdates(DTUpdates);
+  DTUpdates.clear();
+
+  // By now we have: (2/6)
+  //  PreheaderBB:                    ; preds = ???
+  //    <...>
+  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+  //    %ComparedEqual = icmp eq <...> %memcmp, 0
+  //    [no terminator instruction!]
+  //  PhonyPreheaderBB: <preheader>   ; No preds, UNREACHABLE!
+  //    br label %LoopHeaderBB
+  //  LoopHeaderBB: <header,exiting>  ; preds = %PhonyPreheaderBB, %LoopLatchBB
+  //    <...>
+  //    br i1 %<...>, label %LoopLatchBB, label %PhonySuccessorBB
+  //  LoopLatchBB: <latch,exiting>    ; preds = %LoopHeaderBB
+  //    <...>
+  //    br i1 %<...>, label %PhonySuccessorBB, label %LoopHeaderBB
+  //  PhonySuccessorBB: <uniq. exit>  ; preds = %LoopHeaderBB, %LoopLatchBB
+  //    unreachable
+  //  EqualBB:                        ; No preds, UNREACHABLE!
+  //    br label %Successor1BB
+  //  UnequalBB:                      ; No preds, UNREACHABLE!
+  //    br label %Successor0BB
+  //  Successor0BB:                   ; preds = %UnequalBB
+  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
+  //    <...>
+  //  Successor1BB:                   ; preds = %EqualBB
+  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
+  //    <...>
+
+  // *Finally*, zap the original loop. Record its parent loop though.
+  Loop *ParentLoop = CurLoop->getParentLoop();
+  LLVM_DEBUG(dbgs() << "Deleting old loop.\n");
+  LoopDeleter.markLoopAsDeleted(CurLoop); // Mark as deleted *BEFORE* deleting!
+  deleteDeadLoop(CurLoop, DT, SE, LI);    // And actually delete the loop.
+  CurLoop = nullptr;
+
+  // By now we have: (3/6)
+  //  PreheaderBB:                    ; preds = ???
+  //    <...>
+  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+  //    %ComparedEqual = icmp eq <...> %memcmp, 0
+  //    [no terminator instruction!]
+  //  PhonyPreheaderBB:               ; No preds, UNREACHABLE!
+  //    br label %PhonySuccessorBB
+  //  PhonySuccessorBB:               ; preds = %PhonyPreheaderBB
+  //    unreachable
+  //  EqualBB:                        ; No preds, UNREACHABLE!
+  //    br label %Successor1BB
+  //  UnequalBB:                      ; No preds, UNREACHABLE!
+  //    br label %Successor0BB
+  //  Successor0BB:                   ; preds = %UnequalBB
+  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
+  //    <...>
+  //  Successor1BB:                   ; preds = %EqualBB
+  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
+  //    <...>
+
+  // Now, actually restore the CFG.
+
+  // Insert an unconditional branch from an actual preheader basic block to
+  // phony preheader basic block.
+  IRBuilder<>(PreheaderBB).CreateBr(PhonyPreheaderBB);
+  // The update must describe the edge that was just created. Do not mention
+  // the old loop header here: it went away together with the loop.
+  DTUpdates.push_back({DominatorTree::Insert, PreheaderBB, PhonyPreheaderBB});
+  // Insert proper conditional branch from phony successor basic block to the
+  // "dispatch" basic blocks, which were used to preserve incoming values in
+  // original loop's successor basic blocks.
+  assert(isa<UnreachableInst>(PhonySuccessorBB->getTerminator()) &&
+         "Yep, that's the one we created to keep deleteDeadLoop() happy.");
+  PhonySuccessorBB->getTerminator()->eraseFromParent();
+  {
+    IRBuilder<> Builder(PhonySuccessorBB);
+    Builder.SetCurrentDebugLocation(ComparedEqual->getDebugLoc());
+    Builder.CreateCondBr(ComparedEqual, ComparedEqualBB, ComparedUnequalBB);
+  }
+  DTUpdates.push_back(
+      {DominatorTree::Insert, PhonySuccessorBB, ComparedEqualBB});
+  DTUpdates.push_back(
+      {DominatorTree::Insert, PhonySuccessorBB, ComparedUnequalBB});
+
+  BasicBlock *DispatchBB = PhonySuccessorBB;
+  DispatchBB->setName(LoopName + ".bcmpdispatchbb");
+
+  assert(DTUpdates.size() == 3 && "Update count prediction failed.");
+  DTU.applyUpdates(DTUpdates);
+  DTUpdates.clear();
+
+  // By now we have: (4/6)
+  //  PreheaderBB:                    ; preds = ???
+  //    <...>
+  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+  //    %ComparedEqual = icmp eq <...> %memcmp, 0
+  //    br label %PhonyPreheaderBB
+  //  PhonyPreheaderBB:               ; preds = %PreheaderBB
+  //    br label %DispatchBB
+  //  DispatchBB:                     ; preds = %PhonyPreheaderBB
+  //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+  //  EqualBB:                        ; preds = %DispatchBB
+  //    br label %Successor1BB
+  //  UnequalBB:                      ; preds = %DispatchBB
+  //    br label %Successor0BB
+  //  Successor0BB:                   ; preds = %UnequalBB
+  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
+  //    <...>
+  //  Successor1BB:                   ; preds = %EqualBB
+  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
+  //    <...>
+
+  // The basic CFG has been restored! Now let's merge redundant basic blocks.
+
+  // Merge phony successor basic block into its only predecessor,
+  // phony preheader basic block. It is fully pointlessly redundant.
+  MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);
+
+  // By now we have: (5/6)
+  //  PreheaderBB:                    ; preds = ???
+  //    <...>
+  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+  //    %ComparedEqual = icmp eq <...> %memcmp, 0
+  //    br label %DispatchBB
+  //  DispatchBB:                     ; preds = %PreheaderBB
+  //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+  //  EqualBB:                        ; preds = %DispatchBB
+  //    br label %Successor1BB
+  //  UnequalBB:                      ; preds = %DispatchBB
+  //    br label %Successor0BB
+  //  Successor0BB:                   ; preds = %UnequalBB
+  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
+  //    <...>
+  //  Successor1BB:                   ; preds = %EqualBB
+  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
+  //    <...>
+
+  // Was this loop nested?
+  if (!ParentLoop) {
+    // If the loop was *NOT* nested, then let's also merge phony successor
+    // basic block into its only predecessor, preheader basic block.
+    // Also, here we need to update LoopInfo.
+    LI->removeBlock(PreheaderBB);
+    MergeBasicBlockIntoOnlyPred(DispatchBB, &DTU);
+
+    // By now we have: (6/6)
+    //  DispatchBB:                   ; preds = ???
+    //    <...>
+    //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+    //    %ComparedEqual = icmp eq <...> %memcmp, 0
+    //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+    //  EqualBB:                      ; preds = %DispatchBB
+    //    br label %Successor1BB
+    //  UnequalBB:                    ; preds = %DispatchBB
+    //    br label %Successor0BB
+    //  Successor0BB:                 ; preds = %UnequalBB
+    //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
+    //    <...>
+    //  Successor1BB:                 ; preds = %EqualBB
+    //    %S1PHI = phi <...> [ <...>, %EqualBB ]
+    //    <...>
+
+    return DispatchBB;
+  }
+
+  // Otherwise, we need to "preserve" the LoopSimplify form of the deleted loop.
+  // To achieve that, we shall keep the preheader basic block (mainly so that
+  // the loop header block will be guaranteed to have a predecessor outside of
+  // the loop), and create a phony loop with all these new three basic blocks.
+  Loop *PhonyLoop = LI->AllocateLoop();
+  ParentLoop->addChildLoop(PhonyLoop);
+  PhonyLoop->addBasicBlockToLoop(DispatchBB, *LI);
+  PhonyLoop->addBasicBlockToLoop(ComparedEqualBB, *LI);
+  PhonyLoop->addBasicBlockToLoop(ComparedUnequalBB, *LI);
+
+  // But we only have a preheader basic block, a header basic block and
+  // two exiting basic blocks. For a proper loop we also need a backedge from
+  // non-header basic block to header bb.
+  // Let's just add a never-taken branch from both of the exiting basic blocks.
+  for (BasicBlock *BB : {ComparedEqualBB, ComparedUnequalBB}) {
+    BranchInst *OldTerminator = cast<BranchInst>(BB->getTerminator());
+    assert(OldTerminator->isUnconditional() && "That's the one we created.");
+    BasicBlock *SuccessorBB = OldTerminator->getSuccessor(0);
+
+    IRBuilder<> Builder(OldTerminator);
+    Builder.SetCurrentDebugLocation(OldTerminator->getDebugLoc());
+    Builder.CreateCondBr(ConstantInt::getTrue(Context), SuccessorBB,
+                         DispatchBB);
+    OldTerminator->eraseFromParent();
+    // Yes, the backedge will never be taken. The control-flow is redundant.
+    // If it can be simplified further, other passes will take care.
+    // The edge to the successor is both removed (old terminator) and added
+    // back (new terminator); only the backedge is a genuinely new edge.
+    DTUpdates.push_back({DominatorTree::Delete, BB, SuccessorBB});
+    DTUpdates.push_back({DominatorTree::Insert, BB, SuccessorBB});
+    DTUpdates.push_back({DominatorTree::Insert, BB, DispatchBB});
+  }
+  assert(DTUpdates.size() == 6 && "Update count prediction failed.");
+  DTU.applyUpdates(DTUpdates);
+  DTUpdates.clear();
+
+  // By now we have: (6/6)
+  //  PreheaderBB: <preheader>        ; preds = ???
+  //    <...>
+  //    %memcmp = call i32 @memcmp(i8* %LoadSrcA, i8* %LoadSrcB, i64 %Nbytes)
+  //    %ComparedEqual = icmp eq <...> %memcmp, 0
+  //    br label %BCmpDispatchBB
+  //  BCmpDispatchBB: <header>        ; preds = %PreheaderBB
+  //    br i1 %ComparedEqual, label %EqualBB, label %UnequalBB
+  //  EqualBB: <latch,exiting>        ; preds = %BCmpDispatchBB
+  //    br i1 %true, label %Successor1BB, label %BCmpDispatchBB
+  //  UnequalBB: <latch,exiting>      ; preds = %BCmpDispatchBB
+  //    br i1 %true, label %Successor0BB, label %BCmpDispatchBB
+  //  Successor0BB:                   ; preds = %UnequalBB
+  //    %S0PHI = phi <...> [ <...>, %UnequalBB ]
+  //    <...>
+  //  Successor1BB:                   ; preds = %EqualBB
+  //    %S1PHI = phi <...> [ <...>, %EqualBB ]
+  //    <...>
+
+  // Finally fully DONE!
+  return DispatchBB;
+}
+
+/// Replace the recognized comparison loop with a single bcmp()/memcmp() call.
+///
+/// The two start addresses (\p SrcA, \p SrcB) and the byte count (\p NBytes)
+/// are expanded in front of the preheader's terminator, followed by the
+/// library call and an `== 0` test of its result. The loop's control flow is
+/// then rewritten by transformBCmpControlFlow() to branch on that test.
+/// \p BCmpInst, \p LatchCmpInst, \p LoadA and \p LoadB only supply debug
+/// locations for the newly created instructions.
+void LoopIdiomRecognize::transformLoopToBCmp(ICmpInst *BCmpInst,
+                                             CmpInst *LatchCmpInst,
+                                             LoadInst *LoadA, LoadInst *LoadB,
+                                             const SCEV *SrcA, const SCEV *SrcB,
+                                             const SCEV *NBytes) {
+  // Everything new goes right in front of the preheader's terminator.
+  Instruction *PreheaderTerm = CurLoop->getLoopPreheader()->getTerminator();
+  IRBuilder<> Builder(PreheaderTerm);
+
+  LLVM_DEBUG(dbgs() << "Transforming bcmp loop idiom into a call.\n");
+  LLVM_DEBUG(dbgs() << "Emitting new instructions.\n");
+
+  // Materialize the two pointers and the length. Keep in mind that the
+  // iteration count may be a pointer, and that one element is BCmpTyBytes
+  // bytes wide, which need not be a single byte.
+  Value *PtrA, *PtrB, *Len;
+  {
+    SCEVExpander SExp(*SE, *DL, "LoopToBCmp");
+    SExp.setInsertPoint(PreheaderTerm);
+
+    auto ExpandAddress = [&SExp](LoadInst *Load, const SCEV *Addr) {
+      // Borrow the debug location of the original address computation when
+      // it is an instruction; otherwise expand without any location.
+      const auto *AddrInst = dyn_cast<Instruction>(Load->getPointerOperand());
+      SExp.SetCurrentDebugLocation(AddrInst ? AddrInst->getDebugLoc()
+                                            : DebugLoc());
+      return SExp.expandCodeFor(Addr);
+    };
+    PtrA = ExpandAddress(LoadA, SrcA);
+    PtrB = ExpandAddress(LoadB, SrcB);
+
+    // The length computation is attributed to the loop's latch condition.
+    const DebugLoc &LatchLoc = LatchCmpInst->getDebugLoc();
+    Builder.SetCurrentDebugLocation(LatchLoc);
+    SExp.SetCurrentDebugLocation(LatchLoc);
+    Len = SExp.expandCodeFor(NBytes);
+
+    Type *SizeTy = DL->getIntPtrType(Builder.getContext());
+    assert(SE->getTypeSizeInBits(Len->getType()) ==
+               DL->getTypeSizeInBits(SizeTy) &&
+           "Len should already have the correct size.");
+
+    // The length has to be an integer; convert it if we got a pointer.
+    if (Len->getType()->isPointerTy())
+      Len = Builder.CreatePtrToInt(Len, SizeTy);
+    assert(Len->getType() == SizeTy && "Should have correct type now.");
+
+    Len->setName(Len->getName() + ".bytecount");
+
+    // No legality check is required here. The question being asked is whether
+    // [PtrA, PtrA+Len) and [PtrB, PtrB+Len) are identical. They are identical
+    // only if every bit matches, and they differ as soon as one bit differs,
+    // so how many bits get compared per step is irrelevant.
+  }
+
+  // Everything emitted from here on points at the value comparison.
+  Builder.SetCurrentDebugLocation(BCmpInst->getDebugLoc());
+
+  // Emit the library call, preferring bcmp() when it is available.
+  Value *LibCall;
+  if (HasBCmp)
+    LibCall = emitBCmp(PtrA, PtrB, Len, Builder, *DL, TLI);
+  else
+    LibCall = emitMemCmp(PtrA, PtrB, Len, Builder, *DL, TLI);
+  auto *CmpCall = cast<CallInst>(LibCall);
+  // FIXME: add {B,Mem}CmpInst with MemoryCompareInst
+  //        (based on MemIntrinsicBase) as base?
+  // FIXME: propagate metadata from loads? (alignments, AS, TBAA, ...)
+
+  // A zero result from {b,mem}cmp means "equal", anything else "not equal".
+  Constant *Zero = ConstantInt::get(CmpCall->getType(), 0);
+  Value *IsZero = Builder.CreateICmpEQ(
+      CmpCall, Zero, PtrA->getName() + ".vs." + PtrB->getName() + ".eqcmp");
+  auto *ComparedEqual = cast<ICmpInst>(IsZero);
+
+  BasicBlock *BB = transformBCmpControlFlow(ComparedEqual);
+  Builder.ClearInsertionPoint();
+
+  // All done; report it.
+  LLVM_DEBUG(dbgs() << "Transformed loop bcmp idiom into a call.\n");
+  ORE.emit([&]() {
+    return OptimizationRemark(DEBUG_TYPE, "TransformedBCmpIdiomToCall",
+                              CmpCall->getDebugLoc(), BB)
+           << "Transformed bcmp idiom into a call to "
+           << ore::NV("NewFunction", CmpCall->getCalledFunction())
+           << "() function";
+  });
+  ++NumBCmp;
+}
+
+/// Try to recognize a bcmp idiom in the current (non-countable) loop.
+///
+/// When the idiom is found, the loop is rewritten into a call to bcmp() (or
+/// memcmp()) and true is returned. Otherwise nothing is changed and false is
+/// returned.
+bool LoopIdiomRecognize::recognizeBCmp() {
+  // With neither routine available there is nothing to lower the loop to.
+  if (!(HasMemCmp || HasBCmp))
+    return false;
+
+  // Filled in by detectBCmpIdiom() on success.
+  ICmpInst *ValueCmp;
+  CmpInst *LatchCmp;
+  LoadInst *FirstLoad, *SecondLoad;
+  const SCEV *FirstSrc, *SecondSrc, *ByteCount;
+  const bool Detected = detectBCmpIdiom(ValueCmp, LatchCmp, FirstLoad,
+                                        SecondLoad, FirstSrc, SecondSrc,
+                                        ByteCount);
+  if (Detected) {
+    transformLoopToBCmp(ValueCmp, LatchCmp, FirstLoad, SecondLoad, FirstSrc,
+                        SecondSrc, ByteCount);
+    return true;
+  }
+
+  LLVM_DEBUG(dbgs() << "bcmp idiom recognition failed.\n");
+  return false;
+}
--- a/test/Transforms/LoopIdiom/bcmp-basic.ll
+++ b/test/Transforms/LoopIdiom/bcmp-basic.ll
--- a/test/Transforms/LoopIdiom/bcmp-debugify-remarks.ll
+++ b/test/Transforms/LoopIdiom/bcmp-debugify-remarks.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -debugify -loop-idiom < %s -S 2>&1 | FileCheck %s
+; RUN: opt -debugify -loop-idiom -pass-remarks=loop-idiom -pass-remarks-analysis=loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S 2>&1 | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"

@ -23,38 +23,37 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;     sink(std::equal(ptr0[i], ptr0[i] + count[i], ptr1[i]));
 ; }

+; CHECK: remark: <stdin>:13:1: Loop recognized as a bcmp idiom
+; CHECK: remark: <stdin>:11:1: Transformed bcmp idiom into a call to memcmp() function
+; CHECK: remark: <stdin>:29:1: Loop recognized as a bcmp idiom
+; CHECK: remark: <stdin>:34:1: Transformed bcmp idiom into a call to memcmp() function
+
 define i1 @_Z43index_iteration_eq_variable_size_no_overlapPKcm(i8* nocapture %ptr, i64 %count) {
 ; CHECK-LABEL: @_Z43index_iteration_eq_variable_size_no_overlapPKcm(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[COUNT:%.*]], !dbg !22
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[COUNT_BYTECOUNT:%.*]], !dbg !22
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[ADD_PTR]], metadata !9, metadata !DIExpression()), !dbg !22
-; CHECK-NEXT:    [[CMP14:%.*]] = icmp eq i64 [[COUNT]], 0, !dbg !23
+; CHECK-NEXT:    [[CMP14:%.*]] = icmp eq i64 [[COUNT_BYTECOUNT]], 0, !dbg !23
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP14]], metadata !11, metadata !DIExpression()), !dbg !23
-; CHECK-NEXT:    br i1 [[CMP14]], label [[CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]], !dbg !24
-; CHECK:       for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]], !dbg !25
-; CHECK:       for.cond:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i64 [[INC:%.*]], [[COUNT]], !dbg !26
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP]], metadata !13, metadata !DIExpression()), !dbg !26
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[CLEANUP_LOOPEXIT:%.*]], !dbg !27
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_015:%.*]] = phi i64 [ [[INC]], [[FOR_COND:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ], !dbg !28
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[I_015]], metadata !14, metadata !DIExpression()), !dbg !28
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[I_015]], !dbg !29
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[ARRAYIDX]], metadata !15, metadata !DIExpression()), !dbg !29
-; CHECK-NEXT:    [[V0:%.*]] = load i8, i8* [[ARRAYIDX]], !dbg !30
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8 [[V0]], metadata !16, metadata !DIExpression()), !dbg !30
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 [[I_015]], !dbg !31
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[ARRAYIDX1]], metadata !17, metadata !DIExpression()), !dbg !31
-; CHECK-NEXT:    [[V1:%.*]] = load i8, i8* [[ARRAYIDX1]], !dbg !32
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8 [[V1]], metadata !18, metadata !DIExpression()), !dbg !32
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i8 [[V0]], [[V1]], !dbg !33
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP3]], metadata !19, metadata !DIExpression()), !dbg !33
-; CHECK-NEXT:    [[INC]] = add nuw i64 [[I_015]], 1, !dbg !34
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[INC]], metadata !20, metadata !DIExpression()), !dbg !34
-; CHECK-NEXT:    br i1 [[CMP3]], label [[FOR_COND]], label [[CLEANUP_LOOPEXIT]], !dbg !25
+; CHECK-NEXT:    br i1 [[CMP14]], label [[CLEANUP:%.*]], label [[FOR_BODY_BCMPDISPATCHBB:%.*]], !dbg !24
+; CHECK:       for.body.bcmpdispatchbb:
+; CHECK-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[PTR]], i8* [[ADD_PTR]], i64 [[COUNT_BYTECOUNT]]), !dbg !25
+; CHECK-NEXT:    [[PTR_VS_ADD_PTR_EQCMP:%.*]] = icmp eq i32 [[MEMCMP]], 0, !dbg !25
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !14, metadata !DIExpression()), !dbg !26
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !15, metadata !DIExpression()), !dbg !27
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !16, metadata !DIExpression()), !dbg !28
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !17, metadata !DIExpression()), !dbg !29
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !18, metadata !DIExpression()), !dbg !30
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !19, metadata !DIExpression()), !dbg !25
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !20, metadata !DIExpression()), !dbg !31
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !13, metadata !DIExpression()), !dbg !32
+; CHECK-NEXT:    br i1 [[PTR_VS_ADD_PTR_EQCMP]], label [[PTR_VS_ADD_PTR_EQCMP_EQUALBB:%.*]], label [[PTR_VS_ADD_PTR_EQCMP_UNEQUALBB:%.*]], !dbg !25
+; CHECK:       ptr.vs.add.ptr.eqcmp.equalbb:
+; CHECK-NEXT:    br label [[CLEANUP_LOOPEXIT:%.*]], !dbg !33
+; CHECK:       ptr.vs.add.ptr.eqcmp.unequalbb:
+; CHECK-NEXT:    br label [[CLEANUP_LOOPEXIT]], !dbg !34
 ; CHECK:       cleanup.loopexit:
-; CHECK-NEXT:    [[RES_PH:%.*]] = phi i1 [ false, [[FOR_BODY]] ], [ true, [[FOR_COND]] ]
+; CHECK-NEXT:    [[RES_PH:%.*]] = phi i1 [ false, [[PTR_VS_ADD_PTR_EQCMP_UNEQUALBB]] ], [ true, [[PTR_VS_ADD_PTR_EQCMP_EQUALBB]] ]
 ; CHECK-NEXT:    br label [[CLEANUP]], !dbg !35
 ; CHECK:       cleanup:
 ; CHECK-NEXT:    [[RES:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ [[RES_PH]], [[CLEANUP_LOOPEXIT]] ], !dbg !36
@ -106,11 +105,11 @@ define void @_Z16loop_within_loopmPPKcS1_Pm(i64 %outer_count, i8** %ptr0, i8** %
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[T0]], metadata !42, metadata !DIExpression()), !dbg !66
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[COUNT:%.*]], i64 [[I_012]], !dbg !67
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64* [[ARRAYIDX2]], metadata !43, metadata !DIExpression()), !dbg !67
-; CHECK-NEXT:    [[T1:%.*]] = load i64, i64* [[ARRAYIDX2]], !dbg !68
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[T1]], metadata !44, metadata !DIExpression()), !dbg !68
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 [[T1]], !dbg !69
+; CHECK-NEXT:    [[T1_BYTECOUNT:%.*]] = load i64, i64* [[ARRAYIDX2]], !dbg !68
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[T1_BYTECOUNT]], metadata !44, metadata !DIExpression()), !dbg !68
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[T0]], i64 [[T1_BYTECOUNT]], !dbg !69
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[ADD_PTR]], metadata !45, metadata !DIExpression()), !dbg !69
-; CHECK-NEXT:    [[CMP5_I_I:%.*]] = icmp eq i64 [[T1]], 0, !dbg !70
+; CHECK-NEXT:    [[CMP5_I_I:%.*]] = icmp eq i64 [[T1_BYTECOUNT]], 0, !dbg !70
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP5_I_I]], metadata !46, metadata !DIExpression()), !dbg !70
 ; CHECK-NEXT:    br i1 [[CMP5_I_I]], label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT]], label [[FOR_BODY_I_I_PREHEADER:%.*]], !dbg !62
 ; CHECK:       for.body.i.i.preheader:
@ -118,39 +117,35 @@ define void @_Z16loop_within_loopmPPKcS1_Pm(i64 %outer_count, i8** %ptr0, i8** %
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8** [[ARRAYIDX3]], metadata !47, metadata !DIExpression()), !dbg !71
 ; CHECK-NEXT:    [[T2:%.*]] = load i8*, i8** [[ARRAYIDX3]], !dbg !72
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[T2]], metadata !48, metadata !DIExpression()), !dbg !72
-; CHECK-NEXT:    br label [[FOR_BODY_I_I:%.*]], !dbg !73
-; CHECK:       for.body.i.i:
-; CHECK-NEXT:    [[__FIRST2_ADDR_07_I_I:%.*]] = phi i8* [ [[INCDEC_PTR1_I_I:%.*]], [[FOR_INC_I_I:%.*]] ], [ [[T2]], [[FOR_BODY_I_I_PREHEADER]] ], !dbg !74
-; CHECK-NEXT:    [[__FIRST1_ADDR_06_I_I:%.*]] = phi i8* [ [[INCDEC_PTR_I_I:%.*]], [[FOR_INC_I_I]] ], [ [[T0]], [[FOR_BODY_I_I_PREHEADER]] ], !dbg !75
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[__FIRST2_ADDR_07_I_I]], metadata !49, metadata !DIExpression()), !dbg !74
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[__FIRST1_ADDR_06_I_I]], metadata !50, metadata !DIExpression()), !dbg !75
-; CHECK-NEXT:    [[T3:%.*]] = load i8, i8* [[__FIRST1_ADDR_06_I_I]], !dbg !76
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8 [[T3]], metadata !51, metadata !DIExpression()), !dbg !76
-; CHECK-NEXT:    [[T4:%.*]] = load i8, i8* [[__FIRST2_ADDR_07_I_I]], !dbg !77
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8 [[T4]], metadata !52, metadata !DIExpression()), !dbg !77
-; CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i8 [[T3]], [[T4]], !dbg !78
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP_I_I_I]], metadata !53, metadata !DIExpression()), !dbg !78
-; CHECK-NEXT:    br i1 [[CMP_I_I_I]], label [[FOR_INC_I_I]], label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT:%.*]], !dbg !79
-; CHECK:       for.inc.i.i:
-; CHECK-NEXT:    [[INCDEC_PTR_I_I]] = getelementptr inbounds i8, i8* [[__FIRST1_ADDR_06_I_I]], i64 1, !dbg !80
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[INCDEC_PTR_I_I]], metadata !54, metadata !DIExpression()), !dbg !80
-; CHECK-NEXT:    [[INCDEC_PTR1_I_I]] = getelementptr inbounds i8, i8* [[__FIRST2_ADDR_07_I_I]], i64 1, !dbg !81
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i8* [[INCDEC_PTR1_I_I]], metadata !55, metadata !DIExpression()), !dbg !81
-; CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i8* [[INCDEC_PTR_I_I]], [[ADD_PTR]], !dbg !82
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP_I_I]], metadata !56, metadata !DIExpression()), !dbg !82
-; CHECK-NEXT:    br i1 [[CMP_I_I]], label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]], label [[FOR_BODY_I_I]], !dbg !83
+; CHECK-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[T0]], i8* [[T2]], i64 [[T1_BYTECOUNT]]), !dbg !73
+; CHECK-NEXT:    [[T0_VS_T2_EQCMP:%.*]] = icmp eq i32 [[MEMCMP]], 0, !dbg !73
+; CHECK-NEXT:    br label [[FOR_BODY_I_I_BCMPDISPATCHBB:%.*]]
+; CHECK:       for.body.i.i.bcmpdispatchbb:
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !49, metadata !DIExpression()), !dbg !74
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !50, metadata !DIExpression()), !dbg !75
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !51, metadata !DIExpression()), !dbg !76
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !52, metadata !DIExpression()), !dbg !77
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !53, metadata !DIExpression()), !dbg !73
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !54, metadata !DIExpression()), !dbg !78
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !55, metadata !DIExpression()), !dbg !79
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata !56, metadata !DIExpression()), !dbg !80
+; CHECK-NEXT:    br i1 [[T0_VS_T2_EQCMP]], label [[T0_VS_T2_EQCMP_EQUALBB:%.*]], label [[T0_VS_T2_EQCMP_UNEQUALBB:%.*]], !dbg !73
+; CHECK:       t0.vs.t2.eqcmp.equalbb:
+; CHECK-NEXT:    br i1 true, label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT:%.*]], label [[FOR_BODY_I_I_BCMPDISPATCHBB]], !dbg !81
+; CHECK:       t0.vs.t2.eqcmp.unequalbb:
+; CHECK-NEXT:    br i1 true, label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]], label [[FOR_BODY_I_I_BCMPDISPATCHBB]], !dbg !82
 ; CHECK:       _ZNSt3__15equalIPKcS2_EEbT_S3_T0_.exit.loopexit:
-; CHECK-NEXT:    [[RETVAL_0_I_I_PH:%.*]] = phi i1 [ false, [[FOR_BODY_I_I]] ], [ true, [[FOR_INC_I_I]] ]
-; CHECK-NEXT:    br label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT]], !dbg !84
+; CHECK-NEXT:    [[RETVAL_0_I_I_PH:%.*]] = phi i1 [ false, [[T0_VS_T2_EQCMP_UNEQUALBB]] ], [ true, [[T0_VS_T2_EQCMP_EQUALBB]] ]
+; CHECK-NEXT:    br label [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT]], !dbg !83
 ; CHECK:       _ZNSt3__15equalIPKcS2_EEbT_S3_T0_.exit:
-; CHECK-NEXT:    [[RETVAL_0_I_I:%.*]] = phi i1 [ true, [[FOR_BODY]] ], [ [[RETVAL_0_I_I_PH]], [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]] ], !dbg !85
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[RETVAL_0_I_I]], metadata !57, metadata !DIExpression()), !dbg !85
-; CHECK-NEXT:    tail call void @_Z4sinkb(i1 [[RETVAL_0_I_I]]), !dbg !84
-; CHECK-NEXT:    [[INC]] = add nuw i64 [[I_012]], 1, !dbg !86
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[INC]], metadata !58, metadata !DIExpression()), !dbg !86
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INC]], [[OUTER_COUNT]], !dbg !87
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP]], metadata !59, metadata !DIExpression()), !dbg !87
-; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg !88
+; CHECK-NEXT:    [[RETVAL_0_I_I:%.*]] = phi i1 [ true, [[FOR_BODY]] ], [ [[RETVAL_0_I_I_PH]], [[_ZNST3__15EQUALIPKCS2_EEBT_S3_T0__EXIT_LOOPEXIT]] ], !dbg !84
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[RETVAL_0_I_I]], metadata !57, metadata !DIExpression()), !dbg !84
+; CHECK-NEXT:    tail call void @_Z4sinkb(i1 [[RETVAL_0_I_I]]), !dbg !83
+; CHECK-NEXT:    [[INC]] = add nuw i64 [[I_012]], 1, !dbg !85
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[INC]], metadata !58, metadata !DIExpression()), !dbg !85
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[INC]], [[OUTER_COUNT]], !dbg !86
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i1 [[CMP]], metadata !59, metadata !DIExpression()), !dbg !86
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg !87
 ;
 entry:
  %cmp11 = icmp eq i64 %outer_count, 0
--- a/test/Transforms/LoopIdiom/bcmp-widening.ll
+++ b/test/Transforms/LoopIdiom/bcmp-widening.ll
@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -loop-idiom < %s -S | FileCheck %s
+; RUN: opt -loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S | FileCheck %s

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"