diff --git a/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/include/llvm/Transforms/IPO/ProfiledCallGraph.h
new file mode 100644
index 00000000000..8eea41ca77a
--- /dev/null
+++ b/include/llvm/Transforms/IPO/ProfiledCallGraph.h
@@ -0,0 +1,135 @@
+//===-- ProfiledCallGraph.h - Profiled Call Graph ----------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROFGEN_PROFILEDCALLGRAPH_H
+#define LLVM_TOOLS_LLVM_PROFGEN_PROFILEDCALLGRAPH_H
+
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Transforms/IPO/SampleContextTracker.h"
+#include <queue>
+#include <set>
+#include <string>
+
+using namespace llvm;
+using namespace sampleprof;
+
+namespace llvm {
+namespace sampleprof {
+
+// Call graph node for one function; Callees is ordered by callee name.
+struct ProfiledCallGraphNode {
+  struct ProfiledCallGraphNodeComparer {
+    bool operator()(const ProfiledCallGraphNode *LHS,
+                    const ProfiledCallGraphNode *RHS) const {
+      return LHS->Name < RHS->Name;
+    }
+  };
+  ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {}
+  StringRef Name;
+  std::set<ProfiledCallGraphNode *, ProfiledCallGraphNodeComparer> Callees;
+};
+
+// Call graph of profiled functions, rooted at a synthetic entry node.
+class ProfiledCallGraph {
+public:
+  // Taken from Callees itself so the alias tracks the set's custom comparer.
+  using iterator = decltype(ProfiledCallGraphNode::Callees)::iterator;
+  ProfiledCallGraph(StringMap<FunctionSamples> &ProfileMap,
+                    SampleContextTracker &ContextTracker) {
+    // Add a node for every function that has a profile in ProfileMap.
+    for (auto &FuncSample : ProfileMap)
+      addProfiledFunction(FuncSample.second.getName());
+
+    // BFS traverse the context profile trie to add call edges for
+    // both sampled calls as well as calls shown in context.
+    std::queue<ContextTrieNode *> Queue;
+    Queue.push(&ContextTracker.getRootContext());
+    while (!Queue.empty()) {
+      ContextTrieNode *Caller = Queue.front();
+      Queue.pop();
+      FunctionSamples *CallerSamples = Caller->getFunctionSamples();
+
+      // Add calls for context, if both caller and callee have context profile.
+      for (auto &Child : Caller->getAllChildContext()) {
+        ContextTrieNode *Callee = &Child.second;
+        Queue.push(Callee);
+        if (CallerSamples && Callee->getFunctionSamples())
+          addProfiledCall(Caller->getFuncName(), Callee->getFuncName());
+      }
+
+      // Add calls from call site samples.
+      if (!CallerSamples)
+        continue;
+      for (auto &LocCallSite : CallerSamples->getBodySamples())
+        for (auto &NameCallSite : LocCallSite.second.getCallTargets())
+          addProfiledCall(Caller->getFuncName(), NameCallSite.first());
+    }
+  }
+
+  // Iterate over the nodes linked to the synthetic root.
+  iterator begin() { return Root.Callees.begin(); }
+  iterator end() { return Root.Callees.end(); }
+  ProfiledCallGraphNode *getEntryNode() { return &Root; }
+  // Create a node for Name, unless it exists, and link it to the root.
+  void addProfiledFunction(StringRef Name) {
+    if (!ProfiledFunctions.count(Name)) {
+      // Name the node before linking it: Callees is ordered by node name, so
+      // inserting a still-unnamed node would break the set's ordering. The
+      // root link makes every node reachable and does not affect SCC order.
+      ProfiledFunctions[Name] = ProfiledCallGraphNode(Name);
+      Root.Callees.insert(&ProfiledFunctions[Name]);
+    }
+  }
+  // Add an edge from CallerName to CalleeName. The caller must already have
+  // a node; a callee without a node is ignored.
+  void addProfiledCall(StringRef CallerName, StringRef CalleeName) {
+    assert(ProfiledFunctions.count(CallerName));
+    auto CalleeIt = ProfiledFunctions.find(CalleeName);
+    if (CalleeIt == ProfiledFunctions.end())
+      return;
+    ProfiledFunctions[CallerName].Callees.insert(&CalleeIt->second);
+  }
+
+private:
+  ProfiledCallGraphNode Root;
+  StringMap<ProfiledCallGraphNode> ProfiledFunctions;
+};
+
+} // end namespace sampleprof
+
+template <> struct GraphTraits<ProfiledCallGraphNode *> {
+  using NodeRef = ProfiledCallGraphNode *;
+  // Derived from Callees so it matches the set's custom comparer.
+  using ChildIteratorType = decltype(ProfiledCallGraphNode::Callees)::iterator;
+  static NodeRef getEntryNode(NodeRef PCGN) { return PCGN; }
+  static ChildIteratorType child_begin(NodeRef N) { return N->Callees.begin(); }
+  static ChildIteratorType child_end(NodeRef N) { return N->Callees.end(); }
+};
+
+// Traits for the whole graph: traversal starts at the graph's entry node and
+// the node range is the graph's own begin()/end() range.
+template <>
+struct GraphTraits<ProfiledCallGraph *>
+    : public GraphTraits<ProfiledCallGraphNode *> {
+  static NodeRef getEntryNode(ProfiledCallGraph *Graph) {
+    return Graph->getEntryNode();
+  }
+  static ChildIteratorType nodes_begin(ProfiledCallGraph *Graph) {
+    return Graph->begin();
+  }
+  static ChildIteratorType nodes_end(ProfiledCallGraph *Graph) {
+    return Graph->end();
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/Transforms/IPO/SampleContextTracker.h b/include/llvm/Transforms/IPO/SampleContextTracker.h
index bc8f602795a..685a060fe46 100644
--- a/include/llvm/Transforms/IPO/SampleContextTracker.h
+++ b/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -114,13 +114,14 @@ public:
   FunctionSamples *getBaseSamplesFor(const Function &Func,
                                      bool MergeContext = true);
   // Query base profile for a given function by name.
-  FunctionSamples *getBaseSamplesFor(StringRef Name, bool MergeContext);
+  FunctionSamples *getBaseSamplesFor(StringRef Name, bool MergeContext = true);
   // Retrieve the context trie node for given profile context
   ContextTrieNode *getContextFor(const SampleContext &Context);
   // Mark a context profile as inlined when function is inlined.
   // This makes sure that inlined context profile will be excluded in
   // function's base profile.
   void markContextSamplesInlined(const FunctionSamples *InlinedSamples);
+  ContextTrieNode &getRootContext();
   void promoteMergeContextSamplesTree(const Instruction &Inst,
                                       StringRef CalleeName);
   void addCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
diff --git a/lib/Transforms/IPO/SampleContextTracker.cpp b/lib/Transforms/IPO/SampleContextTracker.cpp
index 5ad0ba20b3e..863e8f3833f 100644
--- a/lib/Transforms/IPO/SampleContextTracker.cpp
+++ b/lib/Transforms/IPO/SampleContextTracker.cpp
@@ -328,6 +328,8 @@ void SampleContextTracker::markContextSamplesInlined(
   InlinedSamples->getContext().setState(InlinedContext);
 }
 
+ContextTrieNode &SampleContextTracker::getRootContext() { return RootContext; }
+
 void SampleContextTracker::promoteMergeContextSamplesTree(
     const Instruction &Inst, StringRef CalleeName) {
   LLVM_DEBUG(dbgs() << "Promoting and merging context tree for instr: \n"
@@ -490,6 +492,7 @@ SampleContextTracker::getOrCreateContextPath(const SampleContext &Context,
 }
 
 ContextTrieNode *SampleContextTracker::getTopLevelContextNode(StringRef FName) {
+  assert(!FName.empty() && "Top level node query must provide valid name");
   return RootContext.getChildContext(LineLocation(0, 0), FName);
 }
 
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index 548a8ad216b..79d68f2c62c 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -175,41 +175,42 @@ static cl::opt<bool> ProfileSizeInline(
     cl::desc("Inline cold call sites in profile loader if it's beneficial "
              "for code size."));
 
-static cl::opt<int> ProfileInlineGrowthLimit(
+cl::opt<int> ProfileInlineGrowthLimit(
     "sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
     cl::desc("The size growth ratio limit for proirity-based sample profile "
              "loader inlining."));
 
-static cl::opt<int> ProfileInlineLimitMin(
+cl::opt<int> ProfileInlineLimitMin(
     "sample-profile-inline-limit-min", cl::Hidden, cl::init(100),
     cl::desc("The lower bound of size growth limit for "
              "proirity-based sample profile loader inlining."));
 
-static cl::opt<int> ProfileInlineLimitMax(
+cl::opt<int> ProfileInlineLimitMax(
     "sample-profile-inline-limit-max", cl::Hidden, cl::init(10000),
     cl::desc("The upper bound of size growth limit for "
              "proirity-based sample profile loader inlining."));
 
+cl::opt<int> SampleHotCallSiteThreshold(
+    "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
+    cl::desc("Hot callsite threshold for proirity-based sample profile loader "
+             "inlining."));
+
+cl::opt<int> SampleColdCallSiteThreshold(
+    "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
+    cl::desc("Threshold for inlining cold callsites"));
+
 static cl::opt<int> ProfileICPThreshold(
     "sample-profile-icp-threshold", cl::Hidden, cl::init(5),
     cl::desc(
         "Relative hotness threshold for indirect "
         "call promotion in proirity-based sample profile loader inlining."));
 
-static cl::opt<int> SampleHotCallSiteThreshold(
-    "sample-profile-hot-inline-threshold", cl::Hidden, cl::init(3000),
-    cl::desc("Hot callsite threshold for proirity-based sample profile loader "
-             "inlining."));
-
 static cl::opt<bool> CallsitePrioritizedInline(
     "sample-profile-prioritized-inline", cl::Hidden, cl::ZeroOrMore,
     cl::init(false),
     cl::desc("Use call site prioritized inlining for sample profile loader."
              "Currently only CSSPGO is supported."));
 
-static cl::opt<int> SampleColdCallSiteThreshold(
-    "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
-    cl::desc("Threshold for inlining cold callsites"));
 
 static cl::opt<std::string> ProfileInlineReplayFile(
     "sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
diff --git a/test/tools/llvm-profgen/cs-preinline.test b/test/tools/llvm-profgen/cs-preinline.test
new file mode 100644
index 00000000000..e9aa7cbc73a
--- /dev/null
+++ b/test/tools/llvm-profgen/cs-preinline.test
@@ -0,0 +1,41 @@
+; Test default llvm-profgen with the preinliner off
+; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t
+; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-DEFAULT
+
+; Test that llvm-profgen with the preinliner on merges non-inlinable profiles into the base profile.
+; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --csspgo-preinliner=1
+; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-PREINL
+
+; Test preinliner threshold that prevents all possible inlining and merges everything into base profile.
+; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --csspgo-preinliner=1  -sample-profile-hot-inline-threshold=0
+; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-NO-PREINL
+
+; CHECK-DEFAULT:     [main:1 @ foo]:309:0
+; CHECK-DEFAULT-NEXT: 2.1: 14
+; CHECK-DEFAULT-NEXT: 3: 15
+; CHECK-DEFAULT-NEXT: 3.1: 14 bar:14
+; CHECK-DEFAULT-NEXT: 3.2: 1
+; CHECK-DEFAULT-NEXT: !Attributes: 1
+; CHECK-DEFAULT-NEXT:[main:1 @ foo:3.1 @ bar]:84:0
+; CHECK-DEFAULT-NEXT: 1: 14
+; CHECK-DEFAULT-NEXT: !Attributes: 1
+
+; CHECK-PREINL:     [foo]:309:0
+; CHECK-PREINL-NEXT: 2.1: 14
+; CHECK-PREINL-NEXT: 3: 15
+; CHECK-PREINL-NEXT: 3.1: 14 bar:14
+; CHECK-PREINL-NEXT: 3.2: 1
+; CHECK-PREINL-NEXT: !Attributes: 1
+; CHECK-PREINL-NEXT:[foo:3.1 @ bar]:84:0
+; CHECK-PREINL-NEXT: 1: 14
+; CHECK-PREINL-NEXT: !Attributes: 3
+
+; CHECK-NO-PREINL:     [foo]:309:0
+; CHECK-NO-PREINL-NEXT: 2.1: 14
+; CHECK-NO-PREINL-NEXT: 3: 15
+; CHECK-NO-PREINL-NEXT: 3.1: 14 bar:14
+; CHECK-NO-PREINL-NEXT: 3.2: 1
+; CHECK-NO-PREINL-NEXT: !Attributes: 1
+; CHECK-NO-PREINL-NEXT:[bar]:84:0
+; CHECK-NO-PREINL-NEXT: 1: 14
+; CHECK-NO-PREINL-NEXT: !Attributes: 1
diff --git a/tools/llvm-profgen/CMakeLists.txt b/tools/llvm-profgen/CMakeLists.txt
index e7705eb21c9..949b45ff2f9 100644
--- a/tools/llvm-profgen/CMakeLists.txt
+++ b/tools/llvm-profgen/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
   AllTargetsInfos
   Core
   MC
+  IPO
   MCDisassembler
   Object
   ProfileData
@@ -15,6 +16,7 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_tool(llvm-profgen
   llvm-profgen.cpp
   PerfReader.cpp
+  CSPreInliner.cpp
   ProfiledBinary.cpp
   ProfileGenerator.cpp
   PseudoProbe.cpp
diff --git a/tools/llvm-profgen/CSPreInliner.cpp b/tools/llvm-profgen/CSPreInliner.cpp
new file mode 100644
index 00000000000..74cd09c25de
--- /dev/null
+++ b/tools/llvm-profgen/CSPreInliner.cpp
@@ -0,0 +1,229 @@
+//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CSPreInliner.h"
+#include "llvm/ADT/SCCIterator.h"
+#include <cstdint>
+#include <queue>
+
+#define DEBUG_TYPE "cs-preinliner"
+
+using namespace llvm;
+using namespace sampleprof;
+
+static cl::opt<bool> EnableCSPreInliner(
+    "csspgo-preinliner", cl::Hidden, cl::init(false),
+    cl::desc("Run a global pre-inliner to merge context profile based on "
+             "estimated global top-down inline decisions"));
+
+// The switches specify inline thresholds used in SampleProfileLoader inlining.
+// TODO: the actual threshold to be tuned here because the size here is based
+// on machine code not LLVM IR.
+extern cl::opt<int> SampleHotCallSiteThreshold;
+extern cl::opt<int> SampleColdCallSiteThreshold;
+extern cl::opt<int> ProfileInlineGrowthLimit;
+extern cl::opt<int> ProfileInlineLimitMin;
+extern cl::opt<int> ProfileInlineLimitMax;
+
+static cl::opt<bool> SamplePreInlineReplay(
+    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
+    cl::desc(
+        "Replay previous inlining and adjust context profile accordingly"));
+
+CSPreInliner::CSPreInliner(StringMap<FunctionSamples> &Profiles,
+                           uint64_t HotThreshold, uint64_t ColdThreshold)
+    : ContextTracker(Profiles), ProfileMap(Profiles),
+      HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {}
+
+std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
+  ProfiledCallGraph ProfiledCG(ProfileMap, ContextTracker);
+  const ProfiledCallGraphNode *SyntheticRoot = ProfiledCG.getEntryNode();
+
+  // Collect function names SCC by SCC in the order scc_iterator yields them,
+  // leaving out the call graph's synthetic entry node.
+  std::vector<StringRef> Order;
+  for (scc_iterator<ProfiledCallGraph *> SCC = scc_begin(&ProfiledCG);
+       !SCC.isAtEnd(); ++SCC) {
+    for (ProfiledCallGraphNode *Node : *SCC) {
+      if (Node != SyntheticRoot)
+        Order.push_back(Node->Name);
+    }
+  }
+  // Reversing the SCC order gives the top-down order.
+  std::reverse(Order.begin(), Order.end());
+  return Order;
+}
+
+bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
+                                       const FunctionSamples *CallerSamples) {
+  assert(CallerSamples && "Expect non-null caller samples");
+
+  // Ideally every call made by the function would be considered, but for
+  // context profiles only the frames that are children of the current one in
+  // the trie are relevant. So walk the trie rather than the call targets of
+  // the function profile.
+  ContextTrieNode *CallerNode =
+      ContextTracker.getContextFor(CallerSamples->getContext());
+
+  bool HasNewCandidate = false;
+  for (auto &Child : CallerNode->getAllChildContext()) {
+    ContextTrieNode &CalleeNode = Child.second;
+    FunctionSamples *CalleeSamples = CalleeNode.getFunctionSamples();
+    if (!CalleeSamples)
+      continue;
+
+    // Call site count is more reliable, so look up the matching call target
+    // in the caller's context profile to retrieve the call site count.
+    uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
+    uint64_t CallsiteCount = 0;
+    LineLocation Callsite = CalleeNode.getCallSiteLoc();
+    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
+      SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
+      auto TargetIt = TargetCounts.find(CalleeSamples->getName());
+      if (TargetIt != TargetCounts.end())
+        CallsiteCount = TargetIt->second;
+    }
+
+    // TODO: call site and callee entry count should be mostly consistent, add
+    // check for that.
+    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount));
+    HasNewCandidate = true;
+  }
+
+  return HasNewCandidate;
+}
+
+bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
+  // When replay is requested, just follow the inline decision recorded for
+  // the profiled binary.
+  if (SamplePreInlineReplay)
+    return Candidate.CalleeSamples->getContext().hasAttribute(
+        ContextWasInlined);
+
+  // Pick the size threshold from call site hotness. Only a call site that is
+  // hot and not cold gets the hot threshold; everything else gets the cold
+  // one.
+  // TODO: for small cold functions, we may inline them and need to keep
+  // their context profile accordingly.
+  const bool IsHot = Candidate.CallsiteCount > HotCountThreshold;
+  const bool IsCold = Candidate.CallsiteCount < ColdCountThreshold;
+  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
+  if (IsHot && !IsCold)
+    SampleThreshold = SampleHotCallSiteThreshold;
+
+  return Candidate.SizeCost < SampleThreshold;
+}
+
+void CSPreInliner::processFunction(const StringRef Name) {
+  LLVM_DEBUG(dbgs() << "Process " << Name
+                    << " for context-sensitive pre-inlining\n");
+
+  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
+  if (!FSamples)
+    return;
+
+  // Use the number of lines/probes as proxy for function size for now.
+  // TODO: retrieve accurate size from dwarf or binary instead.
+  const unsigned FuncSize = FSamples->getBodySamples().size();
+  // Growth budget: scaled function size, capped by max then floored by min.
+  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
+  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
+  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
+
+  ProfiledCandidateQueue CQueue;
+  getInlineCandidates(CQueue, FSamples);
+
+  unsigned FuncFinalSize = FuncSize;
+  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
+    ProfiledInlineCandidate Candidate = CQueue.top();
+    CQueue.pop();
+    const bool ShouldInline = shouldInline(Candidate);
+    if (ShouldInline) {
+      // Mark the context as inlined, so that the corresponding context
+      // profile won't be merged into that function's base profile.
+      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
+      Candidate.CalleeSamples->getContext().setAttribute(
+          ContextShouldBeInlined);
+      FuncFinalSize += Candidate.SizeCost;
+      getInlineCandidates(CQueue, Candidate.CalleeSamples);
+    }
+    LLVM_DEBUG(dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
+                      << " context profile for: "
+                      << Candidate.CalleeSamples->getNameWithContext()
+                      << " (callee size: " << Candidate.SizeCost
+                      << ", call count:" << Candidate.CallsiteCount << ")\n");
+  }
+
+  LLVM_DEBUG({
+    if (!CQueue.empty())
+      dbgs() << "  Inline candidates ignored due to size limit (inliner "
+                "original size: "
+             << FuncSize << ", inliner final size: " << FuncFinalSize
+             << ", size limit: " << SizeLimit << ")\n";
+    while (!CQueue.empty()) {
+      ProfiledInlineCandidate Skipped = CQueue.top();
+      CQueue.pop();
+      bool WasInlined =
+          Skipped.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
+      dbgs() << "    " << Skipped.CalleeSamples->getNameWithContext()
+             << " (candidate size:" << Skipped.SizeCost
+             << ", call count: " << Skipped.CallsiteCount << ", previously "
+             << (WasInlined ? "inlined)\n" : "not inlined)\n");
+    }
+  });
+}
+
+void CSPreInliner::run() {
+  if (!EnableCSPreInliner)
+    return;
+
+#ifndef NDEBUG
+  // Debug-only helper printing every profile with total and head samples.
+  auto printProfileNames = [](StringMap<FunctionSamples> &Profiles,
+                              bool IsInput) {
+    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
+           << Profiles.size() << " total):\n";
+    for (auto &Entry : Profiles) {
+      const FunctionSamples &Samples = Entry.second;
+      dbgs() << "  [" << Samples.getNameWithContext() << "] "
+             << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
+             << "\n";
+    }
+  };
+#endif
+
+  LLVM_DEBUG(printProfileNames(ProfileMap, true));
+
+  // Run the global pre-inliner over functions in top-down order to estimate
+  // inline decisions and merge profiles accordingly. This helps profile
+  // merging for ThinLTO, where profiles otherwise cannot be merged back
+  // into the base profile across module/thin-backend boundaries.
+  // It also compresses the context profile to control profile size, since
+  // context profiles are then only needed for functions that are going to
+  // be inlined.
+  for (StringRef FuncName : buildTopDownOrder())
+    processFunction(FuncName);
+
+  // Not inlined context profiles are merged into their base, so such
+  // profiles can be trimmed from the output.
+  std::vector<StringRef> ProfilesToBeRemoved;
+  for (auto &Entry : ProfileMap) {
+    SampleContext Context = Entry.second.getContext();
+    if (Context.isBaseContext() || Context.hasState(InlinedContext))
+      continue;
+    assert(Context.hasState(MergedContext) &&
+           "Not inlined context profile should be merged already");
+    ProfilesToBeRemoved.push_back(Entry.first());
+  }
+
+  // Erase in a second pass so ProfileMap is not modified while iterating it.
+  for (StringRef ContextName : ProfilesToBeRemoved)
+    ProfileMap.erase(ContextName);
+
+  LLVM_DEBUG(printProfileNames(ProfileMap, false));
+}
diff --git a/tools/llvm-profgen/CSPreInliner.h b/tools/llvm-profgen/CSPreInliner.h
new file mode 100644
index 00000000000..5c65d8fd4a3
--- /dev/null
+++ b/tools/llvm-profgen/CSPreInliner.h
@@ -0,0 +1,92 @@
+//===-- CSPreInliner.h - Profile guided preinliner ---------------- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H
+#define LLVM_TOOLS_LLVM_PROFGEN_PGOINLINEADVISOR_H
+
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Transforms/IPO/ProfiledCallGraph.h"
+#include "llvm/Transforms/IPO/SampleContextTracker.h"
+
+using namespace llvm;
+using namespace sampleprof;
+
+namespace llvm {
+namespace sampleprof {
+
+// Inline candidate seen from profile. Samples must not be null.
+struct ProfiledInlineCandidate {
+  ProfiledInlineCandidate(const FunctionSamples *Samples, uint64_t Count)
+      : CalleeSamples(Samples), CallsiteCount(Count),
+        SizeCost(Samples->getBodySamples().size()) {}
+  // Context-sensitive function profile for the inline candidate.
+  const FunctionSamples *CalleeSamples;
+  // Call site count for the inline candidate.
+  // TODO: make sure entry count for context profile and call site
+  // target count for corresponding call are consistent.
+  uint64_t CallsiteCount;
+  // Size proxy for the callee in this context: its body sample count.
+  // TODO: use post-inline callee size from debug info.
+  uint64_t SizeCost;
+};
+
+// Inline candidate comparer using call site weight: the candidate with the
+// higher call site count, then the smaller size, is the greater one.
+struct ProfiledCandidateComparer {
+  bool operator()(const ProfiledInlineCandidate &LHS,
+                  const ProfiledInlineCandidate &RHS) const {
+    if (LHS.CallsiteCount != RHS.CallsiteCount)
+      return LHS.CallsiteCount < RHS.CallsiteCount;
+    if (LHS.SizeCost != RHS.SizeCost)
+      return LHS.SizeCost > RHS.SizeCost;
+
+    // Tie breaker using GUID so we have stable/deterministic inlining order
+    assert(LHS.CalleeSamples && RHS.CalleeSamples &&
+           "Expect non-null FunctionSamples");
+    return LHS.CalleeSamples->getGUID(LHS.CalleeSamples->getName()) <
+           RHS.CalleeSamples->getGUID(RHS.CalleeSamples->getName());
+  }
+};
+
+using ProfiledCandidateQueue =
+    PriorityQueue<ProfiledInlineCandidate, std::vector<ProfiledInlineCandidate>,
+                  ProfiledCandidateComparer>;
+
+// Pre-compilation inliner based on context-sensitive profile.
+// The PreInliner estimates inline decisions using hotness from the profile
+// and cost estimation from machine code size. It helps merge context
+// profiles globally and achieves better post-inline profile quality, which
+// otherwise won't be possible for ThinLTO. It also reduces context profile
+// size by only keeping contexts that are estimated to be inlined.
+class CSPreInliner {
+public:
+  CSPreInliner(StringMap<FunctionSamples> &Profiles, uint64_t HotThreshold,
+               uint64_t ColdThreshold);
+  void run();
+
+private:
+  bool getInlineCandidates(ProfiledCandidateQueue &CQueue,
+                           const FunctionSamples *FCallerContextSamples);
+  std::vector<StringRef> buildTopDownOrder();
+  void processFunction(StringRef Name);
+  bool shouldInline(ProfiledInlineCandidate &Candidate);
+  SampleContextTracker ContextTracker;
+  StringMap<FunctionSamples> &ProfileMap;
+
+  // Count thresholds to answer isHotCount and isColdCount queries.
+  // Mirrors the threshold in ProfileSummaryInfo.
+  uint64_t HotCountThreshold;
+  uint64_t ColdCountThreshold;
+};
+
+} // end namespace sampleprof
+} // end namespace llvm
+
+#endif
diff --git a/tools/llvm-profgen/ProfileGenerator.cpp b/tools/llvm-profgen/ProfileGenerator.cpp
index b3fb015b672..a6794f01551 100644
--- a/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/tools/llvm-profgen/ProfileGenerator.cpp
@@ -234,9 +234,7 @@ void CSProfileGenerator::generateProfile() {
   // body sample.
   populateInferredFunctionSamples();
 
-  // Compute hot/cold threshold based on profile. This will be used for cold
-  // context profile merging/trimming.
-  computeSummaryAndThreshold();
+  postProcessProfiles();
 }
 
 void CSProfileGenerator::updateBodySamplesforFunctionProfile(
@@ -392,6 +390,20 @@ void CSProfileGenerator::populateInferredFunctionSamples() {
   }
 }
 
+void CSProfileGenerator::postProcessProfiles() {
+  // Compute the hot/cold thresholds from the profile up front; cold context
+  // profile merging/trimming uses them.
+  computeSummaryAndThreshold();
+
+  // Let the global pre-inliner adjust/merge context profiles according to
+  // its estimated inline decisions.
+  const uint64_t HotThreshold = PSI->getHotCountThreshold();
+  const uint64_t ColdThreshold = PSI->getColdCountThreshold();
+  CSPreInliner(ProfileMap, HotThreshold, ColdThreshold).run();
+
+  mergeAndTrimColdProfile(ProfileMap);
+}
+
 void CSProfileGenerator::computeSummaryAndThreshold() {
   SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
   auto Summary = Builder.computeSummaryForProfiles(ProfileMap);
@@ -451,17 +463,19 @@ void CSProfileGenerator::mergeAndTrimColdProfile(
 
 void CSProfileGenerator::write(std::unique_ptr<SampleProfileWriter> Writer,
                                StringMap<FunctionSamples> &ProfileMap) {
-  mergeAndTrimColdProfile(ProfileMap);
   // Add bracket for context key to support different profile binary format
   StringMap<FunctionSamples> CxtWithBracketPMap;
   for (const auto &Item : ProfileMap) {
-    std::string ContextWithBracket = "[" + Item.first().str() + "]";
+    // After CSPreInliner the key of ProfileMap is no longer accurate for
+    // context, use the context attached to function samples instead.
+    std::string ContextWithBracket =
+        "[" + Item.second.getNameWithContext().str() + "]";
     auto Ret = CxtWithBracketPMap.try_emplace(ContextWithBracket, Item.second);
     assert(Ret.second && "Must be a unique context");
     SampleContext FContext(Ret.first->first(), RawContext);
     FunctionSamples &FProfile = Ret.first->second;
     FContext.setAllAttributes(FProfile.getContext().getAllAttributes());
-    FProfile.setName(FContext.getNameWithContext(true));
+    FProfile.setName(FContext.getNameWithoutContext());
     FProfile.setContext(FContext);
   }
   Writer->write(CxtWithBracketPMap);
@@ -500,9 +514,7 @@ void PseudoProbeCSProfileGenerator::generateProfile() {
     }
   }
 
-  // Compute hot/cold threshold based on profile. This will be used for cold
-  // context profile merging/trimming.
-  computeSummaryAndThreshold();
+  postProcessProfiles();
 }
 
 void PseudoProbeCSProfileGenerator::extractProbesFromRange(
diff --git a/tools/llvm-profgen/ProfileGenerator.h b/tools/llvm-profgen/ProfileGenerator.h
index ff0116fb5c3..0ba884f3afb 100644
--- a/tools/llvm-profgen/ProfileGenerator.h
+++ b/tools/llvm-profgen/ProfileGenerator.h
@@ -8,6 +8,7 @@
 
 #ifndef LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H
 #define LLVM_TOOLS_LLVM_PROGEN_PROFILEGENERATOR_H
+#include "CSPreInliner.h"
 #include "ErrorHandling.h"
 #include "PerfReader.h"
 #include "ProfiledBinary.h"
@@ -178,6 +179,9 @@ protected:
   // Lookup or create FunctionSamples for the context
   FunctionSamples &getFunctionProfileForContext(StringRef ContextId,
                                                 bool WasLeafInlined = false);
+  // Post processing for profiles before writing out, such as merging
+  // and trimming cold profiles, running preinliner on profiles.
+  void postProcessProfiles();
   // Merge cold context profile whose total sample is below threshold
   // into base profile.
   void mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap);
@@ -185,6 +189,9 @@ protected:
   void write(std::unique_ptr<SampleProfileWriter> Writer,
              StringMap<FunctionSamples> &ProfileMap) override;
 
+  // Profile summary to answer isHotCount and isColdCount queries.
+  std::unique_ptr<ProfileSummaryInfo> PSI;
+
 private:
   // Helper function for updating body sample for a leaf location in
   // FunctionProfile
@@ -200,9 +207,6 @@ private:
                                        ProfiledBinary *Binary);
   void populateInferredFunctionSamples();
 
-  // Profile summary to answer isHotCount and isColdCount queries.
-  std::unique_ptr<ProfileSummaryInfo> PSI;
-
 public:
   // Deduplicate adjacent repeated context sequences up to a given sequence
   // length. -1 means no size limit.