
[HeapProf] Clang and LLVM support for heap profiling instrumentation

See RFC for background:
http://lists.llvm.org/pipermail/llvm-dev/2020-June/142744.html

Note that the runtime changes will be sent separately (hopefully this
week, need to add some tests).

This patch includes the LLVM pass to instrument memory accesses with
either inline sequences to increment the access count in the shadow
location, or alternatively to call into the runtime. It also changes
calls to memset/memcpy/memmove to the equivalent runtime version.
The pass is modeled on the address sanitizer pass.

The clang changes add the driver option to invoke the new pass, and to
link with the upcoming heap profiling runtime libraries.

Currently there is no attempt to optimize the instrumentation, e.g. to
aggregate updates to the same memory allocation. That will be
implemented as follow-on work.
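
As an illustration of the inline mode, here is a rough sketch of the
sequence the pass emits for a simple i32 load, reconstructed from the
CHECK patterns in the new tests (default granularity 64 and scale 3;
value names are illustrative):

  %shadow_offset = load i64, i64* @__heapprof_shadow_memory_dynamic_address
  %addr = ptrtoint i32* %a to i64
  %masked = and i64 %addr, -64        ; mask to the 64-byte granule
  %slot = lshr i64 %masked, 3         ; scale down to the 8-byte shadow slot
  %shadow_addr = add i64 %slot, %shadow_offset
  %counter_ptr = inttoptr i64 %shadow_addr to i64*
  %count = load i64, i64* %counter_ptr
  %inc = add i64 %count, 1            ; increment the access count
  store i64 %inc, i64* %counter_ptr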

Differential Revision: https://reviews.llvm.org/D85948
commit bce40acc54 (parent 2088bfe3c4)
Author: Teresa Johnson
Date: 2020-08-13 16:29:38 -07:00
12 changed files with 1184 additions and 0 deletions


@@ -176,6 +176,7 @@ void initializeGlobalSplitPass(PassRegistry&);
void initializeGlobalsAAWrapperPassPass(PassRegistry&);
void initializeGuardWideningLegacyPassPass(PassRegistry&);
void initializeHardwareLoopsPass(PassRegistry&);
void initializeHeapProfilerLegacyPassPass(PassRegistry &);
void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
void initializeIPSCCPLegacyPassPass(PassRegistry&);
@@ -303,6 +304,7 @@ void initializeMergeICmpsLegacyPassPass(PassRegistry &);
void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
void initializeMetaRenamerPass(PassRegistry&);
void initializeModuleDebugInfoPrinterPass(PassRegistry&);
void initializeModuleHeapProfilerLegacyPassPass(PassRegistry &);
void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&);
void initializeModuloScheduleTestPass(PassRegistry&);
void initializeMustExecutePrinterPass(PassRegistry&);


@@ -0,0 +1,51 @@
//===--------- Definition of the HeapProfiler class ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the HeapProfiler class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H
#define LLVM_TRANSFORMS_INSTRUMENTATION_HEAPPROFILER_H
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
namespace llvm {
/// Public interface to the heap profiler pass for instrumenting code to
/// profile heap memory accesses.
///
/// The profiler itself is a function pass that works by inserting various
/// calls to the HeapProfiler runtime library functions. The runtime library
/// essentially replaces malloc() and free() with custom implementations that
/// record data about the allocations.
class HeapProfilerPass : public PassInfoMixin<HeapProfilerPass> {
public:
explicit HeapProfilerPass();
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
/// Public interface to the heap profiler module pass for instrumenting code
/// to profile heap memory allocations and accesses.
class ModuleHeapProfilerPass : public PassInfoMixin<ModuleHeapProfilerPass> {
public:
explicit ModuleHeapProfilerPass();
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
private:
};
// Insert HeapProfiler instrumentation
FunctionPass *createHeapProfilerFunctionPass();
ModulePass *createModuleHeapProfilerLegacyPassPass();
} // namespace llvm
#endif
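
For a quick standalone run of these passes, the pass names registered
later in this patch can be used directly with opt, mirroring the RUN
lines in the new tests (input.ll is a placeholder):

  opt -passes='function(heapprof),module(heapprof-module)' input.ll -S

or, with the legacy pass manager:

  opt -heapprof -heapprof-module input.ll -S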


@@ -110,6 +110,7 @@
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
@@ -258,6 +259,10 @@ static cl::opt<bool>
cl::Hidden,
cl::desc("Enable inline deferral during PGO"));
static cl::opt<bool> EnableHeapProfiler("enable-heap-prof", cl::init(false),
cl::Hidden, cl::ZeroOrMore,
cl::desc("Enable heap profiler"));
PipelineTuningOptions::PipelineTuningOptions() {
LoopInterleaving = true;
LoopVectorization = true;
@@ -1034,6 +1039,12 @@ ModulePassManager PassBuilder::buildModuleSimplificationPipeline(
MPM.addPass(SyntheticCountsPropagation());
MPM.addPass(buildInlinerPipeline(Level, Phase, DebugLogging));
if (EnableHeapProfiler && Phase != ThinLTOPhase::PreLink) {
MPM.addPass(createModuleToFunctionPassAdaptor(HeapProfilerPass()));
MPM.addPass(ModuleHeapProfilerPass());
}
return MPM;
}
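
Since EnableHeapProfiler is an ordinary cl::opt, the instrumentation can
also be switched on in the default pipelines from the command line; a
sketch (note the guard above skips the ThinLTO pre-link phase; input.ll
is a placeholder):

  opt -passes='default<O2>' -enable-heap-prof input.ll -S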


@@ -97,6 +97,7 @@ MODULE_PASS("msan-module", MemorySanitizerPass({}))
MODULE_PASS("tsan-module", ThreadSanitizerPass())
MODULE_PASS("kasan-module", ModuleAddressSanitizerPass(/*CompileKernel=*/true, false, true, false))
MODULE_PASS("sancov-module", ModuleSanitizerCoveragePass())
MODULE_PASS("heapprof-module", ModuleHeapProfilerPass())
MODULE_PASS("poison-checking", PoisonCheckingPass())
#undef MODULE_PASS
@@ -276,6 +277,7 @@ FUNCTION_PASS("kasan", AddressSanitizerPass(true, false, false))
FUNCTION_PASS("msan", MemorySanitizerPass({}))
FUNCTION_PASS("kmsan", MemorySanitizerPass({0, false, /*Kernel=*/true}))
FUNCTION_PASS("tsan", ThreadSanitizerPass())
FUNCTION_PASS("heapprof", HeapProfilerPass())
#undef FUNCTION_PASS
#ifndef FUNCTION_PASS_WITH_PARAMS


@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMInstrumentation
ControlHeightReduction.cpp
DataFlowSanitizer.cpp
GCOVProfiling.cpp
HeapProfiler.cpp
MemorySanitizer.cpp
IndirectCallPromotion.cpp
Instrumentation.cpp


@@ -0,0 +1,613 @@
//===- HeapProfiler.cpp - heap allocation and access profiler ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of HeapProfiler. Memory accesses are instrumented
// to increment the access count held in a shadow memory location, or
// alternatively to call into the runtime. Memory intrinsic calls (memmove,
// memcpy, memset) are changed to call the heap profiling runtime version
// instead.
//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Instrumentation/HeapProfiler.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
#define DEBUG_TYPE "heapprof"
constexpr int LLVM_HEAP_PROFILER_VERSION = 1;
// Size of memory mapped to a single shadow location.
constexpr uint64_t DefaultShadowGranularity = 64;
// Scale from granularity down to shadow size.
constexpr uint64_t DefaultShadowScale = 3;
constexpr char HeapProfModuleCtorName[] = "heapprof.module_ctor";
constexpr uint64_t HeapProfCtorAndDtorPriority = 1;
// On Emscripten, the system needs more than one priority for constructors.
constexpr uint64_t HeapProfEmscriptenCtorAndDtorPriority = 50;
constexpr char HeapProfInitName[] = "__heapprof_init";
constexpr char HeapProfVersionCheckNamePrefix[] =
"__heapprof_version_mismatch_check_v";
constexpr char HeapProfShadowMemoryDynamicAddress[] =
"__heapprof_shadow_memory_dynamic_address";
// Command-line flags.
static cl::opt<bool> ClInsertVersionCheck(
"heapprof-guard-against-version-mismatch",
cl::desc("Guard against compiler/runtime version mismatch."), cl::Hidden,
cl::init(true));
// This flag may need to be replaced with -f[no-]memprof-reads.
static cl::opt<bool> ClInstrumentReads("heapprof-instrument-reads",
cl::desc("instrument read instructions"),
cl::Hidden, cl::init(true));
static cl::opt<bool>
ClInstrumentWrites("heapprof-instrument-writes",
cl::desc("instrument write instructions"), cl::Hidden,
cl::init(true));
static cl::opt<bool> ClInstrumentAtomics(
"heapprof-instrument-atomics",
cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
cl::init(true));
static cl::opt<bool> ClUseCalls(
"heapprof-use-callbacks",
cl::desc("Use callbacks instead of inline instrumentation sequences."),
cl::Hidden, cl::init(false));
static cl::opt<std::string>
ClMemoryAccessCallbackPrefix("heapprof-memory-access-callback-prefix",
cl::desc("Prefix for memory access callbacks"),
cl::Hidden, cl::init("__heapprof_"));
// These flags allow changing the shadow mapping.
// The shadow mapping looks like
// Shadow = ((Mem & mask) >> scale) + offset
static cl::opt<int> ClMappingScale("heapprof-mapping-scale",
cl::desc("scale of heapprof shadow mapping"),
cl::Hidden, cl::init(DefaultShadowScale));
static cl::opt<int>
ClMappingGranularity("heapprof-mapping-granularity",
cl::desc("granularity of heapprof shadow mapping"),
cl::Hidden, cl::init(DefaultShadowGranularity));
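// Editorial sketch (not part of this patch): with the defaults plugged in,
// Granularity = 64 gives Mask = ~63ULL and Scale = 3, so an access at
// address Mem bumps the 8-byte counter at
//   ((Mem & ~63ULL) >> 3) + DynamicShadowOffset
// Addresses 0x1000 and 0x103f share one 64-byte granule, so both map to
// the counter at 0x200 + DynamicShadowOffset.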
// Debug flags.
static cl::opt<int> ClDebug("heapprof-debug", cl::desc("debug"), cl::Hidden,
cl::init(0));
static cl::opt<std::string> ClDebugFunc("heapprof-debug-func", cl::Hidden,
cl::desc("Debug func"));
static cl::opt<int> ClDebugMin("heapprof-debug-min", cl::desc("Debug min inst"),
cl::Hidden, cl::init(-1));
static cl::opt<int> ClDebugMax("heapprof-debug-max", cl::desc("Debug max inst"),
cl::Hidden, cl::init(-1));
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
namespace {
/// This struct defines the shadow mapping using the rule:
/// shadow = ((mem & mask) >> Scale) + DynamicShadowOffset.
struct ShadowMapping {
ShadowMapping() {
Scale = ClMappingScale;
Granularity = ClMappingGranularity;
Mask = ~(Granularity - 1);
}
int Scale;
int Granularity;
uint64_t Mask; // Computed as ~(Granularity-1)
};
static uint64_t getCtorAndDtorPriority(Triple &TargetTriple) {
return TargetTriple.isOSEmscripten() ? HeapProfEmscriptenCtorAndDtorPriority
: HeapProfCtorAndDtorPriority;
}
struct InterestingMemoryAccess {
Value *Addr = nullptr;
bool IsWrite;
unsigned Alignment;
uint64_t TypeSize;
Value *MaybeMask = nullptr;
};
/// Instrument the code in module to profile heap accesses.
class HeapProfiler {
public:
HeapProfiler(Module &M) {
C = &(M.getContext());
LongSize = M.getDataLayout().getPointerSizeInBits();
IntptrTy = Type::getIntNTy(*C, LongSize);
}
/// If it is an interesting memory access, populate information
/// about the access and return an InterestingMemoryAccess struct.
/// Otherwise return None.
Optional<InterestingMemoryAccess> isInterestingMemoryAccess(Instruction *I);
void instrumentMop(Instruction *I, const DataLayout &DL,
InterestingMemoryAccess &Access);
void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
Value *Addr, uint32_t TypeSize, bool IsWrite);
void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
Instruction *I, Value *Addr,
unsigned Alignment, uint32_t TypeSize,
bool IsWrite);
void instrumentMemIntrinsic(MemIntrinsic *MI);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
bool instrumentFunction(Function &F);
bool maybeInsertHeapProfInitAtFunctionEntry(Function &F);
void insertDynamicShadowAtFunctionEntry(Function &F);
private:
void initializeCallbacks(Module &M);
LLVMContext *C;
int LongSize;
Type *IntptrTy;
ShadowMapping Mapping;
// These arrays are indexed by AccessIsWrite.
FunctionCallee HeapProfMemoryAccessCallback[2];
FunctionCallee HeapProfMemoryAccessCallbackSized[2];
FunctionCallee HeapProfMemmove, HeapProfMemcpy, HeapProfMemset;
Value *DynamicShadowOffset = nullptr;
};
class HeapProfilerLegacyPass : public FunctionPass {
public:
static char ID;
explicit HeapProfilerLegacyPass() : FunctionPass(ID) {
initializeHeapProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override { return "HeapProfilerFunctionPass"; }
bool runOnFunction(Function &F) override {
HeapProfiler Profiler(*F.getParent());
return Profiler.instrumentFunction(F);
}
};
class ModuleHeapProfiler {
public:
ModuleHeapProfiler(Module &M) { TargetTriple = Triple(M.getTargetTriple()); }
bool instrumentModule(Module &);
private:
Triple TargetTriple;
ShadowMapping Mapping;
Function *HeapProfCtorFunction = nullptr;
};
class ModuleHeapProfilerLegacyPass : public ModulePass {
public:
static char ID;
explicit ModuleHeapProfilerLegacyPass() : ModulePass(ID) {
initializeModuleHeapProfilerLegacyPassPass(
*PassRegistry::getPassRegistry());
}
StringRef getPassName() const override { return "ModuleHeapProfiler"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {}
bool runOnModule(Module &M) override {
ModuleHeapProfiler HeapProfiler(M);
return HeapProfiler.instrumentModule(M);
}
};
} // end anonymous namespace
HeapProfilerPass::HeapProfilerPass() {}
PreservedAnalyses HeapProfilerPass::run(Function &F,
AnalysisManager<Function> &AM) {
Module &M = *F.getParent();
HeapProfiler Profiler(M);
if (Profiler.instrumentFunction(F))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}
ModuleHeapProfilerPass::ModuleHeapProfilerPass() {}
PreservedAnalyses ModuleHeapProfilerPass::run(Module &M,
AnalysisManager<Module> &AM) {
ModuleHeapProfiler Profiler(M);
if (Profiler.instrumentModule(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
}
char HeapProfilerLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(HeapProfilerLegacyPass, "heapprof",
"HeapProfiler: profile heap allocations and accesses.",
false, false)
INITIALIZE_PASS_END(HeapProfilerLegacyPass, "heapprof",
"HeapProfiler: profile heap allocations and accesses.",
false, false)
FunctionPass *llvm::createHeapProfilerFunctionPass() {
return new HeapProfilerLegacyPass();
}
char ModuleHeapProfilerLegacyPass::ID = 0;
INITIALIZE_PASS(ModuleHeapProfilerLegacyPass, "heapprof-module",
"HeapProfiler: profile heap allocations and accesses."
"ModulePass",
false, false)
ModulePass *llvm::createModuleHeapProfilerLegacyPassPass() {
return new ModuleHeapProfilerLegacyPass();
}
Value *HeapProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
// (Shadow & mask) >> scale
Shadow = IRB.CreateAnd(Shadow, Mapping.Mask);
Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
// (Shadow >> scale) | offset
assert(DynamicShadowOffset);
return IRB.CreateAdd(Shadow, DynamicShadowOffset);
}
// Instrument memset/memmove/memcpy
void HeapProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) {
IRBuilder<> IRB(MI);
if (isa<MemTransferInst>(MI)) {
IRB.CreateCall(
isa<MemMoveInst>(MI) ? HeapProfMemmove : HeapProfMemcpy,
{IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
} else if (isa<MemSetInst>(MI)) {
IRB.CreateCall(
HeapProfMemset,
{IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
}
MI->eraseFromParent();
}
Optional<InterestingMemoryAccess>
HeapProfiler::isInterestingMemoryAccess(Instruction *I) {
// Do not instrument the load fetching the dynamic shadow address.
if (DynamicShadowOffset == I)
return None;
InterestingMemoryAccess Access;
const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
if (!ClInstrumentReads)
return None;
Access.IsWrite = false;
Access.TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
Access.Alignment = LI->getAlignment();
Access.Addr = LI->getPointerOperand();
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
if (!ClInstrumentWrites)
return None;
Access.IsWrite = true;
Access.TypeSize =
DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
Access.Alignment = SI->getAlignment();
Access.Addr = SI->getPointerOperand();
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
if (!ClInstrumentAtomics)
return None;
Access.IsWrite = true;
Access.TypeSize =
DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
Access.Alignment = 0;
Access.Addr = RMW->getPointerOperand();
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
if (!ClInstrumentAtomics)
return None;
Access.IsWrite = true;
Access.TypeSize =
DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
Access.Alignment = 0;
Access.Addr = XCHG->getPointerOperand();
} else if (auto *CI = dyn_cast<CallInst>(I)) {
auto *F = CI->getCalledFunction();
if (F && (F->getIntrinsicID() == Intrinsic::masked_load ||
F->getIntrinsicID() == Intrinsic::masked_store)) {
unsigned OpOffset = 0;
if (F->getIntrinsicID() == Intrinsic::masked_store) {
if (!ClInstrumentWrites)
return None;
// Masked store has an initial operand for the value.
OpOffset = 1;
Access.IsWrite = true;
} else {
if (!ClInstrumentReads)
return None;
Access.IsWrite = false;
}
auto *BasePtr = CI->getOperand(0 + OpOffset);
auto *Ty = cast<PointerType>(BasePtr->getType())->getElementType();
Access.TypeSize = DL.getTypeStoreSizeInBits(Ty);
if (auto *AlignmentConstant =
dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
Access.Alignment = (unsigned)AlignmentConstant->getZExtValue();
else
Access.Alignment = 1; // No alignment guarantees. We probably got Undef.
Access.MaybeMask = CI->getOperand(2 + OpOffset);
Access.Addr = BasePtr;
}
}
if (!Access.Addr)
return None;
// Do not instrument accesses from different address spaces; we cannot deal
// with them.
Type *PtrTy = cast<PointerType>(Access.Addr->getType()->getScalarType());
if (PtrTy->getPointerAddressSpace() != 0)
return None;
// Ignore swifterror addresses.
// swifterror memory addresses are mem2reg promoted by instruction
// selection. As such they cannot have regular uses like an instrumentation
// function and it makes no sense to track them as memory.
if (Access.Addr->isSwiftError())
return None;
return Access;
}
void HeapProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL,
Value *Mask, Instruction *I,
Value *Addr, unsigned Alignment,
uint32_t TypeSize,
bool IsWrite) {
auto *VTy =
cast<VectorType>(cast<PointerType>(Addr->getType())->getElementType());
uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
unsigned Num = VTy->getNumElements();
auto *Zero = ConstantInt::get(IntptrTy, 0);
for (unsigned Idx = 0; Idx < Num; ++Idx) {
Value *InstrumentedAddress = nullptr;
Instruction *InsertBefore = I;
if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
// dyn_cast as we might get UndefValue
if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
if (Masked->isZero())
// Mask is constant false, so no instrumentation needed.
continue;
// If we have a true or undef value, fall through to instrumentAddress
// with InsertBefore == I.
}
} else {
IRBuilder<> IRB(I);
Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
InsertBefore = ThenTerm;
}
IRBuilder<> IRB(InsertBefore);
InstrumentedAddress =
IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
instrumentAddress(I, InsertBefore, InstrumentedAddress, ElemTypeSize,
IsWrite);
}
}
void HeapProfiler::instrumentMop(Instruction *I, const DataLayout &DL,
InterestingMemoryAccess &Access) {
if (Access.IsWrite)
NumInstrumentedWrites++;
else
NumInstrumentedReads++;
if (Access.MaybeMask) {
instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr,
Access.Alignment, Access.TypeSize,
Access.IsWrite);
} else {
// Since the access counts will be accumulated across the entire allocation,
// we only update the shadow access count for the first location and thus
// don't need to worry about alignment and type size.
instrumentAddress(I, I, Access.Addr, Access.TypeSize, Access.IsWrite);
}
}
void HeapProfiler::instrumentAddress(Instruction *OrigIns,
Instruction *InsertBefore, Value *Addr,
uint32_t TypeSize, bool IsWrite) {
IRBuilder<> IRB(InsertBefore);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
if (ClUseCalls) {
IRB.CreateCall(HeapProfMemoryAccessCallback[IsWrite], AddrLong);
return;
}
// Create an inline sequence to compute shadow location, and increment the
// value by one.
Type *ShadowTy = Type::getInt64Ty(*C);
Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
Value *ShadowPtr = memToShadow(AddrLong, IRB);
Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy);
Value *ShadowValue = IRB.CreateLoad(ShadowTy, ShadowAddr);
Value *Inc = ConstantInt::get(Type::getInt64Ty(*C), 1);
ShadowValue = IRB.CreateAdd(ShadowValue, Inc);
IRB.CreateStore(ShadowValue, ShadowAddr);
}
bool ModuleHeapProfiler::instrumentModule(Module &M) {
// Create a module constructor.
std::string HeapProfVersion = std::to_string(LLVM_HEAP_PROFILER_VERSION);
std::string VersionCheckName =
ClInsertVersionCheck ? (HeapProfVersionCheckNamePrefix + HeapProfVersion)
: "";
std::tie(HeapProfCtorFunction, std::ignore) =
createSanitizerCtorAndInitFunctions(M, HeapProfModuleCtorName,
HeapProfInitName, /*InitArgTypes=*/{},
/*InitArgs=*/{}, VersionCheckName);
const uint64_t Priority = getCtorAndDtorPriority(TargetTriple);
appendToGlobalCtors(M, HeapProfCtorFunction, Priority);
return true;
}
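// Editorial sketch (reconstructed from the new tests, not part of this
// patch): the constructor created above looks roughly like
//   define internal void @heapprof.module_ctor() {
//     call void @__heapprof_init()
//     call void @__heapprof_version_mismatch_check_v1()
//     ret void
//   }
// and is appended to @llvm.global_ctors with the priority chosen above.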
void HeapProfiler::initializeCallbacks(Module &M) {
IRBuilder<> IRB(*C);
for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
const std::string TypeStr = AccessIsWrite ? "store" : "load";
SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
SmallVector<Type *, 2> Args1{1, IntptrTy};
HeapProfMemoryAccessCallbackSized[AccessIsWrite] =
M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr + "N",
FunctionType::get(IRB.getVoidTy(), Args2, false));
HeapProfMemoryAccessCallback[AccessIsWrite] =
M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + TypeStr,
FunctionType::get(IRB.getVoidTy(), Args1, false));
}
HeapProfMemmove = M.getOrInsertFunction(
ClMemoryAccessCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
HeapProfMemcpy = M.getOrInsertFunction(
ClMemoryAccessCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
HeapProfMemset = M.getOrInsertFunction(
ClMemoryAccessCallbackPrefix + "memset", IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
}
bool HeapProfiler::maybeInsertHeapProfInitAtFunctionEntry(Function &F) {
// For each NSObject descendant having a +load method, this method is invoked
// by the ObjC runtime before any of the static constructors is called.
// Therefore we need to instrument such methods with a call to __heapprof_init
// at the beginning in order to initialize our runtime before any access to
// the shadow memory.
// We cannot just ignore these methods, because they may call other
// instrumented functions.
if (F.getName().find(" load]") != std::string::npos) {
FunctionCallee HeapProfInitFunction =
declareSanitizerInitFunction(*F.getParent(), HeapProfInitName, {});
IRBuilder<> IRB(&F.front(), F.front().begin());
IRB.CreateCall(HeapProfInitFunction, {});
return true;
}
return false;
}
void HeapProfiler::insertDynamicShadowAtFunctionEntry(Function &F) {
IRBuilder<> IRB(&F.front().front());
Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
HeapProfShadowMemoryDynamicAddress, IntptrTy);
DynamicShadowOffset = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
}
bool HeapProfiler::instrumentFunction(Function &F) {
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
return false;
if (ClDebugFunc == F.getName())
return false;
if (F.getName().startswith("__heapprof_"))
return false;
bool FunctionModified = false;
// If needed, insert __heapprof_init.
// This function needs to be called even if the function body is not
// instrumented.
if (maybeInsertHeapProfInitAtFunctionEntry(F))
FunctionModified = true;
LLVM_DEBUG(dbgs() << "HEAPPROF instrumenting:\n" << F << "\n");
initializeCallbacks(*F.getParent());
insertDynamicShadowAtFunctionEntry(F);
SmallVector<Instruction *, 16> ToInstrument;
// Fill the set of memory operations to instrument.
for (auto &BB : F) {
for (auto &Inst : BB) {
if (isInterestingMemoryAccess(&Inst) || isa<MemIntrinsic>(Inst))
ToInstrument.push_back(&Inst);
}
}
int NumInstrumented = 0;
for (auto *Inst : ToInstrument) {
if (ClDebugMin < 0 || ClDebugMax < 0 ||
(NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
Optional<InterestingMemoryAccess> Access =
isInterestingMemoryAccess(Inst);
if (Access)
instrumentMop(Inst, F.getParent()->getDataLayout(), *Access);
else
instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
}
NumInstrumented++;
}
if (NumInstrumented > 0)
FunctionModified = true;
LLVM_DEBUG(dbgs() << "HEAPPROF done instrumenting: " << FunctionModified
<< " " << F << "\n");
return FunctionModified;
}


@@ -105,6 +105,8 @@ Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializeAddressSanitizerLegacyPassPass(Registry);
initializeModuleAddressSanitizerLegacyPassPass(Registry);
initializeHeapProfilerLegacyPassPass(Registry);
initializeModuleHeapProfilerLegacyPassPass(Registry);
initializeBoundsCheckingLegacyPassPass(Registry);
initializeControlHeightReductionLegacyPassPass(Registry);
initializeGCOVProfilerLegacyPassPass(Registry);


@@ -0,0 +1,179 @@
; Test basic heap profiler instrumentation.
;
; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
; We need the requires since both heapprof and heapprof-module require reading module-level metadata, which is done once by the heapprof-globals-md analysis
; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -S | FileCheck --check-prefixes=CHECK,CHECK-S3 %s
; RUN: opt < %s -passes='function(heapprof),module(heapprof-module)' -heapprof-mapping-scale=5 -S | FileCheck --check-prefixes=CHECK,CHECK-S5 %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; CHECK: @llvm.global_ctors = {{.*}}@heapprof.module_ctor
define i32 @test_load(i32* %a) {
entry:
%tmp1 = load i32, i32* %a, align 4
ret i32 %tmp1
}
; CHECK-LABEL: @test_load
; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address
; CHECK-NEXT: %[[LOAD_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[LOAD_ADDR]], -64
; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5
; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
; CHECK-NEXT: %[[LOAD_SHADOW_PTR:[^ ]*]] = inttoptr
; CHECK-NEXT: %[[LOAD_SHADOW:[^ ]*]] = load i64, i64* %[[LOAD_SHADOW_PTR]]
; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i64 %[[LOAD_SHADOW]], 1
; CHECK-NEXT: store i64 %[[NEW_SHADOW]], i64* %[[LOAD_SHADOW_PTR]]
; The actual load.
; CHECK-NEXT: %tmp1 = load i32, i32* %a
; CHECK-NEXT: ret i32 %tmp1
define void @test_store(i32* %a) {
entry:
store i32 42, i32* %a, align 4
ret void
}
; CHECK-LABEL: @test_store
; CHECK: %[[SHADOW_OFFSET:[^ ]*]] = load i64, i64* @__heapprof_shadow_memory_dynamic_address
; CHECK-NEXT: %[[STORE_ADDR:[^ ]*]] = ptrtoint i32* %a to i64
; CHECK-NEXT: %[[MASKED_ADDR:[^ ]*]] = and i64 %[[STORE_ADDR]], -64
; CHECK-S3-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 3
; CHECK-S5-NEXT: %[[SHIFTED_ADDR:[^ ]*]] = lshr i64 %[[MASKED_ADDR]], 5
; CHECK-NEXT: add i64 %[[SHIFTED_ADDR]], %[[SHADOW_OFFSET]]
; CHECK-NEXT: %[[STORE_SHADOW_PTR:[^ ]*]] = inttoptr
; CHECK-NEXT: %[[STORE_SHADOW:[^ ]*]] = load i64, i64* %[[STORE_SHADOW_PTR]]
; CHECK-NEXT: %[[NEW_SHADOW:[^ ]*]] = add i64 %[[STORE_SHADOW]], 1
; CHECK-NEXT: store i64 %[[NEW_SHADOW]], i64* %[[STORE_SHADOW_PTR]]
; The actual store.
; CHECK-NEXT: store i32 42, i32* %a
; CHECK-NEXT: ret void
define void @FP80Test(x86_fp80* nocapture %a) nounwind uwtable {
entry:
store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a, align 16
ret void
}
; CHECK-LABEL: @FP80Test
; Exactly one shadow update for store access.
; CHECK-NOT: store i64
; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
; CHECK-NOT: store i64
; The actual store.
; CHECK: store x86_fp80 0xK3FFF8000000000000000, x86_fp80* %a
; CHECK: ret void
define void @i40test(i40* %a, i40* %b) nounwind uwtable {
entry:
%t = load i40, i40* %a
store i40 %t, i40* %b, align 8
ret void
}
; CHECK-LABEL: @i40test
; Exactly one shadow update for load access.
; CHECK-NOT: store i64
; CHECK: %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]]
; CHECK-NOT: store i64
; The actual load.
; CHECK: %t = load i40, i40* %a
; Exactly one shadow update for store access.
; CHECK-NOT: store i64
; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
; CHECK-NOT: store i64
; The actual store.
; CHECK: store i40 %t, i40* %b
; CHECK: ret void
define void @i64test_align1(i64* %b) nounwind uwtable {
entry:
store i64 0, i64* %b, align 1
ret void
}
; CHECK-LABEL: @i64test
; Exactly one shadow update for store access.
; CHECK-NOT: store i64
; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
; CHECK-NOT: store i64
; The actual store.
; CHECK: store i64 0, i64* %b
; CHECK: ret void
define void @i80test(i80* %a, i80* %b) nounwind uwtable {
entry:
%t = load i80, i80* %a
store i80 %t, i80* %b, align 8
ret void
}
; CHECK-LABEL: i80test
; Exactly one shadow update for load access.
; CHECK-NOT: store i64
; CHECK: %[[NEW_LD_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
; CHECK-NEXT: store i64 %[[NEW_LD_SHADOW]]
; CHECK-NOT: store i64
; The actual load.
; CHECK: %t = load i80, i80* %a
; Exactly one shadow update for store access.
; CHECK-NOT: store i64
; CHECK: %[[NEW_ST_SHADOW:[^ ]*]] = add i64 %{{.*}}, 1
; CHECK-NEXT: store i64 %[[NEW_ST_SHADOW]]
; CHECK-NOT: store i64
; The actual store.
; CHECK: store i80 %t, i80* %b
; CHECK: ret void
; heapprof should not instrument functions with available_externally linkage.
define available_externally i32 @f_available_externally(i32* %a) {
entry:
%tmp1 = load i32, i32* %a
ret i32 %tmp1
}
; CHECK-LABEL: @f_available_externally
; CHECK-NOT: __heapprof_shadow_memory_dynamic_address
; CHECK: ret i32
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind
declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) nounwind
define void @memintr_test(i8* %a, i8* %b) nounwind uwtable {
entry:
tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 100, i1 false)
tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false)
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 100, i1 false)
ret void
}
; CHECK-LABEL: memintr_test
; CHECK: __heapprof_memset
; CHECK: __heapprof_memmove
; CHECK: __heapprof_memcpy
; CHECK: ret void
declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture writeonly, i8, i64, i32) nounwind
declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind
declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32) nounwind
define void @memintr_element_atomic_test(i8* %a, i8* %b) nounwind uwtable {
; This is a canary test to make sure that these don't get lowered into calls that don't
; have the element-atomic property. Eventually, heapprof will have to be enhanced to lower
; these properly.
; CHECK-LABEL: memintr_element_atomic_test
; CHECK: tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1)
; CHECK: tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
; CHECK: tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
; CHECK: ret void
tail call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 1 %a, i8 0, i64 100, i32 1)
tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %a, i8* align 1 %b, i64 100, i32 1)
ret void
}
; CHECK: define internal void @heapprof.module_ctor()
; CHECK: call void @__heapprof_init()


@@ -0,0 +1,36 @@
; Test heapprof internal compiler flags:
; -heapprof-use-callbacks
; -heapprof-memory-access-callback-prefix
; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-DEFAULT
; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks -heapprof-memory-access-callback-prefix=__foo_ -S | FileCheck %s --check-prefix=CHECK-CALL --check-prefix=CHECK-CALL-CUSTOM
; RUN: opt < %s -heapprof -heapprof-module -heapprof-use-callbacks=false -S | FileCheck %s --check-prefix=CHECK-INLINE
; RUN: opt < %s -heapprof -heapprof-module -S | FileCheck %s --check-prefix=CHECK-INLINE
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
define void @test_load(i32* %a, i64* %b, i512* %c, i80* %d) {
entry:
; CHECK-CALL: %[[LOAD_ADDR1:[^ ]*]] = ptrtoint i32* %a to i64
; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR1]])
; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR1]])
; CHECK-CALL: %[[LOAD_ADDR2:[^ ]*]] = ptrtoint i64* %b to i64
; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR2]])
; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR2]])
; CHECK-CALL: %[[LOAD_ADDR3:[^ ]*]] = ptrtoint i512* %c to i64
; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR3]])
; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR3]])
; CHECK-CALL: %[[LOAD_ADDR4:[^ ]*]] = ptrtoint i80* %d to i64
; CHECK-CALL-DEFAULT: call void @__heapprof_load(i64 %[[LOAD_ADDR4]])
; CHECK-CALL-CUSTOM: call void @__foo_load(i64 %[[LOAD_ADDR4]])
; CHECK-CALL-DEFAULT-NOT: call void @__heapprof_load
; CHECK-CALL-CUSTOM-NOT: call void @__foo_load
; CHECK-INLINE-NOT: call void @__heapprof_load
%tmp1 = load i32, i32* %a, align 4
%tmp2 = load i64, i64* %b, align 8
%tmp3 = load i512, i512* %c, align 32
%tmp4 = load i80, i80* %d, align 8
ret void
}


@@ -0,0 +1,246 @@
; RUN: opt < %s -heapprof -heapprof-use-callbacks -S \
; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL
; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -S \
; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL
; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-writes=0 -S \
; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL
; RUN: opt < %s -heapprof -heapprof-use-callbacks -heapprof-instrument-reads=0 -heapprof-instrument-writes=0 -S \
; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL
; Support heap profiling instrumentation for constant-mask llvm.masked.{load,store}
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
@v4f32 = global <4 x float>* zeroinitializer, align 8
@v8i32 = global <8 x i32>* zeroinitializer, align 8
@v4i64 = global <4 x i32*>* zeroinitializer, align 8
;;;;;;;;;;;;;;;; STORE
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) argmemonly nounwind
declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32, <8 x i1>) argmemonly nounwind
declare void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*>, <4 x i32*>*, i32, <4 x i1>) argmemonly nounwind
define void @store.v4f32.1110(<4 x float> %arg) {
; ALL-LABEL: @store.v4f32.1110
%p = load <4 x float>*, <4 x float>** @v4f32, align 8
; NOSTORE-NOT: call void @__heapprof_store
; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP0]])
; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP1]])
; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP2]])
; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 false>)
ret void
}
define void @store.v8i32.10010110(<8 x i32> %arg) {
; ALL-LABEL: @store.v8i32.10010110
%p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
; NOSTORE-NOT: call void @__heapprof_store
; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP0]])
; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 3
; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP3]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP3]])
; STORE: [[GEP5:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 5
; STORE: [[PGEP5:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP5]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP5]])
; STORE: [[GEP6:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 6
; STORE: [[PGEP6:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP6]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP6]])
; STORE: tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %arg, <8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false>)
ret void
}
define void @store.v4i64.0001(<4 x i32*> %arg) {
; ALL-LABEL: @store.v4i64.0001
%p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
; NOSTORE-NOT: call void @__heapprof_store
; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP3]])
; STORE: tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
tail call void @llvm.masked.store.v4p0i32.p0v4p0i32(<4 x i32*> %arg, <4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>)
ret void
}
define void @store.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) {
; ALL-LABEL: @store.v4f32.variable
%p = load <4 x float>*, <4 x float>** @v4f32, align 8
; STORE: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0
; STORE: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]]
; STORE: [[THEN0]]:
; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP0]])
; STORE: br label %[[AFTER0]]
; STORE: [[AFTER0]]:
; STORE: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1
; STORE: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]]
; STORE: [[THEN1]]:
; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP1]])
; STORE: br label %[[AFTER1]]
; STORE: [[AFTER1]]:
; STORE: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2
; STORE: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]]
; STORE: [[THEN2]]:
; STORE: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
; STORE: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP2]])
; STORE: br label %[[AFTER2]]
; STORE: [[AFTER2]]:
; STORE: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3
; STORE: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]]
; STORE: [[THEN3]]:
; STORE: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
; STORE: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP3]])
; STORE: br label %[[AFTER3]]
; STORE: [[AFTER3]]:
; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> %mask)
ret void
}
;; Store using two masked.stores, which should instrument them both.
define void @store.v4f32.1010.split(<4 x float> %arg) {
; ALL-LABEL: @store.v4f32.1010.split
%p = load <4 x float>*, <4 x float>** @v4f32, align 8
; STORE: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
; STORE: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP0]])
; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>)
; STORE: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
; STORE: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
; STORE: call void @__heapprof_store(i64 [[PGEP1]])
; STORE: tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %arg, <4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>)
ret void
}
;;;;;;;;;;;;;;;; LOAD
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) argmemonly nounwind
declare <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>*, i32, <8 x i1>, <8 x i32>) argmemonly nounwind
declare <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>*, i32, <4 x i1>, <4 x i32*>) argmemonly nounwind
define <8 x i32> @load.v8i32.11100001(<8 x i32> %arg) {
; ALL-LABEL: @load.v8i32.11100001
%p = load <8 x i32>*, <8 x i32>** @v8i32, align 8
; NOLOAD-NOT: call void @__heapprof_load
; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 0
; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP0]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 1
; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP1]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP1]])
; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 2
; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP2]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP2]])
; LOAD: [[GEP7:%[0-9A-Za-z]+]] = getelementptr <8 x i32>, <8 x i32>* %p, i64 0, i64 7
; LOAD: [[PGEP7:%[0-9A-Za-z]+]] = ptrtoint i32* [[GEP7]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP7]])
; LOAD: tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
%res = tail call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false, i1 true>, <8 x i32> %arg)
ret <8 x i32> %res
}
define <4 x float> @load.v4f32.1001(<4 x float> %arg) {
; ALL-LABEL: @load.v4f32.1001
%p = load <4 x float>*, <4 x float>** @v4f32, align 8
; NOLOAD-NOT: call void @__heapprof_load
; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
%res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %arg)
ret <4 x float> %res
}
define <4 x i32*> @load.v4i64.0001(<4 x i32*> %arg) {
; ALL-LABEL: @load.v4i64.0001
%p = load <4 x i32*>*, <4 x i32*>** @v4i64, align 8
; NOLOAD-NOT: call void @__heapprof_load
; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x i32*>, <4 x i32*>* %p, i64 0, i64 3
; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint i32** [[GEP3]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
; LOAD: tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
%res = tail call <4 x i32*> @llvm.masked.load.v4p0i32.p0v4p0i32(<4 x i32*>* %p, i32 8, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32*> %arg)
ret <4 x i32*> %res
}
define <4 x float> @load.v4f32.variable(<4 x float> %arg, <4 x i1> %mask) {
; ALL-LABEL: @load.v4f32.variable
%p = load <4 x float>*, <4 x float>** @v4f32, align 8
; LOAD: [[MASK0:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 0
; LOAD: br i1 [[MASK0]], label %[[THEN0:[0-9A-Za-z]+]], label %[[AFTER0:[0-9A-Za-z]+]]
; LOAD: [[THEN0]]:
; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
; LOAD: br label %[[AFTER0]]
; LOAD: [[AFTER0]]:
; LOAD: [[MASK1:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 1
; LOAD: br i1 [[MASK1]], label %[[THEN1:[0-9A-Za-z]+]], label %[[AFTER1:[0-9A-Za-z]+]]
; LOAD: [[THEN1]]:
; LOAD: [[GEP1:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 1
; LOAD: [[PGEP1:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP1]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP1]])
; LOAD: br label %[[AFTER1]]
; LOAD: [[AFTER1]]:
; LOAD: [[MASK2:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 2
; LOAD: br i1 [[MASK2]], label %[[THEN2:[0-9A-Za-z]+]], label %[[AFTER2:[0-9A-Za-z]+]]
; LOAD: [[THEN2]]:
; LOAD: [[GEP2:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 2
; LOAD: [[PGEP2:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP2]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP2]])
; LOAD: br label %[[AFTER2]]
; LOAD: [[AFTER2]]:
; LOAD: [[MASK3:%[0-9A-Za-z]+]] = extractelement <4 x i1> %mask, i64 3
; LOAD: br i1 [[MASK3]], label %[[THEN3:[0-9A-Za-z]+]], label %[[AFTER3:[0-9A-Za-z]+]]
; LOAD: [[THEN3]]:
; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
; LOAD: br label %[[AFTER3]]
; LOAD: [[AFTER3]]:
; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg)
%res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> %mask, <4 x float> %arg)
ret <4 x float> %res
}
;; Load using two masked.loads, which should instrument them both.
define <4 x float> @load.v4f32.1001.split(<4 x float> %arg) {
; ALL-LABEL: @load.v4f32.1001.split
%p = load <4 x float>*, <4 x float>** @v4f32, align 8
; LOAD: [[GEP0:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 0
; LOAD: [[PGEP0:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP0]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP0]])
; LOAD: %res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg)
%res = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %arg)
; LOAD: [[GEP3:%[0-9A-Za-z]+]] = getelementptr <4 x float>, <4 x float>* %p, i64 0, i64 3
; LOAD: [[PGEP3:%[0-9A-Za-z]+]] = ptrtoint float* [[GEP3]] to i64
; LOAD: call void @__heapprof_load(i64 [[PGEP3]])
; LOAD: tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res)
%res2 = tail call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %p, i32 4, <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %res)
ret <4 x float> %res2
}


@@ -0,0 +1,29 @@
; Test that the scale (-heapprof-mapping-scale) and granularity (-heapprof-mapping-granularity) command-line options work as expected
;
; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 32 -S | FileCheck --check-prefix=CHECK-GRAN %s
; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-scale 1 -S | FileCheck --check-prefix=CHECK-SCALE %s
; RUN: opt < %s -heapprof -heapprof-module -heapprof-mapping-granularity 16 -heapprof-mapping-scale 0 -S | FileCheck --check-prefix=CHECK-BOTH %s
target triple = "x86_64-unknown-linux-gnu"
define i32 @read(i32* %a) {
entry:
%tmp1 = load i32, i32* %a, align 4
ret i32 %tmp1
}
; CHECK-GRAN-LABEL: @read
; CHECK-GRAN-NOT: ret
; CHECK-GRAN: and {{.*}} -32
; CHECK-GRAN-NEXT: lshr {{.*}} 3
; CHECK-GRAN: ret
; CHECK-SCALE-LABEL: @read
; CHECK-SCALE-NOT: ret
; CHECK-SCALE: and {{.*}} -64
; CHECK-SCALE-NEXT: lshr {{.*}} 1
; CHECK-SCALE: ret
; CHECK-BOTH-LABEL: @read
; CHECK-BOTH-NOT: ret
; CHECK-BOTH: and {{.*}} -16
; CHECK-BOTH-NEXT: lshr {{.*}} 0
; CHECK-BOTH: ret


@@ -0,0 +1,12 @@
; Check that the HeapProf module constructor guards against compiler/runtime version
; mismatch.
; RUN: opt < %s -heapprof-module -S | FileCheck %s
; RUN: opt < %s -heapprof-module -heapprof-guard-against-version-mismatch=0 -S | FileCheck %s --check-prefix=NOGUARD
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
; CHECK-LABEL: define internal void @heapprof.module_ctor()
; CHECK: call void @__heapprof_version_mismatch_check_v1
; NOGUARD-NOT: call void @__heapprof_version_mismatch_check_