[PDB] Optimize public symbol processing

Reduces time to link PGO instrumented net_unittets.exe by 11% (9.766s -> 8.672s, best of three). Reduces peak memory by 65.7MB (2142.71MB -> 2076.95MB). Use a more compact struct, BulkPublic, for faster sorting. Sort in parallel. Construct the hash buckets in parallel. Try to use one vector to hold all the publics instead of copying them from one to another. Allocate all the memory needed to serialize publics up front, and then serialize them in place in parallel. Reviewed By: aganea, hans Differential Revision: https://reviews.llvm.org/D79467
2024-11-22 02:33:06 +01:00 · 2020-05-03 09:29:03 -07:00 · 2020-05-03 09:29:03 -07:00 · 75e13669c7
commit 75e13669c7
parent b0c9d2ccf3
2 changed files with 217 additions and 119 deletions
--- a/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h
@ -37,6 +37,7 @@ struct MSFLayout;
 } // namespace msf
 namespace pdb {
 struct GSIHashStreamBuilder;
+struct BulkPublic;

 class GSIStreamBuilder {

@ -55,7 +56,8 @@ public:
  uint32_t getGlobalsStreamIndex() const { return GlobalsStreamIndex; }
  uint32_t getRecordStreamIndex() const { return RecordStreamIndex; }

-  void addPublicSymbol(const codeview::PublicSym32 &Pub);
+  // Add public symbols in bulk.
+  void addPublicSymbols(std::vector<BulkPublic> &&Publics);

  void addGlobalSymbol(const codeview::ProcRefSym &Sym);
  void addGlobalSymbol(const codeview::DataSym &Sym);
@ -75,7 +77,40 @@ private:
  msf::MSFBuilder &Msf;
  std::unique_ptr<GSIHashStreamBuilder> PSH;
  std::unique_ptr<GSIHashStreamBuilder> GSH;
+  std::vector<support::ulittle32_t> PubAddrMap;
 };
+
+/// This struct is equivalent to codeview::PublicSym32, but it has been
+/// optimized for size to speed up bulk serialization and sorting operations
+/// during PDB writing.
+struct BulkPublic {
+  const char *Name = nullptr;
+  uint32_t NameLen = 0;
+
+  // Offset of the symbol record in the publics stream.
+  uint32_t SymOffset = 0;
+
+  // Section offset of the symbol in the image.
+  uint32_t Offset = 0;
+
+  union {
+    // Section index of the section containing the symbol.
+    uint16_t Segment;
+
+    // GSI hash table bucket index.
+    uint16_t BucketIdx;
+  } U{0};
+
+  // PublicSymFlags or hash bucket index
+  uint16_t Flags = 0;
+
+  StringRef getName() const { return StringRef(Name, NameLen); }
+};
+
+static_assert(sizeof(BulkPublic) <= 24, "unexpected size increase");
+static_assert(std::is_trivially_copyable<BulkPublic>::value,
+              "should be trivial");
+
 } // namespace pdb
 } // namespace llvm

--- a/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
@ -5,6 +5,12 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// The data structures defined in this file are based on the reference
+// implementation which is available at
+// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
+//
+//===----------------------------------------------------------------------===//

 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"

@ -20,6 +26,7 @@
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
 #include "llvm/Support/BinaryItemStream.h"
 #include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Parallel.h"
 #include "llvm/Support/xxhash.h"
 #include <algorithm>
 #include <vector>
@ -57,24 +64,66 @@ struct llvm::pdb::GSIHashStreamBuilder {
  uint32_t calculateSerializedLength() const;
  uint32_t calculateRecordByteSize() const;
  Error commit(BinaryStreamWriter &Writer);
+
  void finalizeBuckets(uint32_t RecordZeroOffset);

+  // Finalize public symbol buckets.
+  void finalizeBuckets(uint32_t RecordZeroOffset,
+                       std::vector<BulkPublic> &&Publics);
+
  template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
    T Copy(Symbol);
    addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
                                               CodeViewContainer::Pdb));
  }
-  void addSymbol(const CVSymbol &Symbol) {
-    if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
-      auto Iter = SymbolHashes.insert(Symbol);
-      if (!Iter.second)
-        return;
-    }
-
-    Records.push_back(Symbol);
-  }
+  void addSymbol(const CVSymbol &Symbol);
 };

+void GSIHashStreamBuilder::addSymbol(const codeview::CVSymbol &Symbol) {
+  // Ignore duplicate typedefs and constants.
+  if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
+    auto Iter = SymbolHashes.insert(Symbol);
+    if (!Iter.second)
+      return;
+  }
+  Records.push_back(Symbol);
+}
+
+namespace {
+LLVM_PACKED(struct PublicSym32Layout {
+  RecordPrefix Prefix;
+  PublicSym32Header Pub;
+  // char Name[];
+});
+}
+
+// Calculate how much memory this public needs when serialized.
+static uint32_t sizeOfPublic(const BulkPublic &Pub) {
+  uint32_t NameLen = Pub.NameLen;
+  NameLen = std::min(NameLen,
+                     uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
+  return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
+}
+
+static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
+  // Assume the caller has allocated sizeOfPublic bytes.
+  uint32_t NameLen = std::min(
+      Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
+  size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
+  assert(Size == sizeOfPublic(Pub));
+  auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
+  FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
+  FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
+  FixedMem->Pub.Flags = Pub.Flags;
+  FixedMem->Pub.Offset = Pub.Offset;
+  FixedMem->Pub.Segment = Pub.U.Segment;
+  char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
+  memcpy(NameMem, Pub.Name, NameLen);
+  // Zero the null terminator and remaining bytes.
+  memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
+  return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
+}
+
 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
  uint32_t Size = sizeof(GSIHashHeader);
  Size += HashRecords.size() * sizeof(PSHashRecord);
@ -114,70 +163,97 @@ static bool isAsciiString(StringRef S) {
 }

 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
-static bool gsiRecordLess(StringRef S1, StringRef S2) {
+static int gsiRecordCmp(StringRef S1, StringRef S2) {
  size_t LS = S1.size();
  size_t RS = S2.size();
  // Shorter strings always compare less than longer strings.
  if (LS != RS)
-    return LS < RS;
+    return LS - RS;

  // If either string contains non ascii characters, memcmp them.
  if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
-    return memcmp(S1.data(), S2.data(), LS) < 0;
+    return memcmp(S1.data(), S2.data(), LS);

  // Both strings are ascii, perform a case-insenstive comparison.
-  return S1.compare_lower(S2.data()) < 0;
+  return S1.compare_lower(S2.data());
 }

 void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
-  std::array<std::vector<std::pair<StringRef, PSHashRecord>>, IPHR_HASH + 1>
-      TmpBuckets;
+  // Build up a list of globals to be bucketed. This repurposes the BulkPublic
+  // struct with different meanings for the fields to avoid reallocating a new
+  // vector during public symbol table hash construction.
+  std::vector<BulkPublic> Globals;
+  Globals.resize(Records.size());
  uint32_t SymOffset = RecordZeroOffset;
-  for (const CVSymbol &Sym : Records) {
-    PSHashRecord HR;
-    // Add one when writing symbol offsets to disk. See GSI1::fixSymRecs.
-    HR.Off = SymOffset + 1;
-    HR.CRef = 1; // Always use a refcount of 1.
-
-    // Hash the name to figure out which bucket this goes into.
-    StringRef Name = getSymbolName(Sym);
-    size_t BucketIdx = hashStringV1(Name) % IPHR_HASH;
-    TmpBuckets[BucketIdx].push_back(std::make_pair(Name, HR));
-    SymOffset += Sym.length();
+  for (size_t I = 0, E = Records.size(); I < E; ++I) {
+    StringRef Name = getSymbolName(Records[I]);
+    Globals[I].Name = Name.data();
+    Globals[I].NameLen = Name.size();
+    Globals[I].SymOffset = SymOffset;
+    SymOffset += Records[I].length();
  }

+  finalizeBuckets(RecordZeroOffset, std::move(Globals));
+}
+
+void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset,
+                                           std::vector<BulkPublic> &&Globals) {
+  // Hash every name in parallel. The Segment field is no longer needed, so
+  // store the BucketIdx in a union.
+  parallelForEachN(0, Globals.size(), [&](size_t I) {
+    Globals[I].U.BucketIdx = hashStringV1(Globals[I].Name) % IPHR_HASH;
+  });
+
+  // Parallel sort by bucket index, then name within the buckets. Within the
+  // buckets, sort each bucket by memcmp of the symbol's name.  It's important
+  // that we use the same sorting algorithm as is used by the reference
+  // implementation to ensure that the search for a record within a bucket can
+  // properly early-out when it detects the record won't be found.  The
+  // algorithm used here corredsponds to the function
+  // caseInsensitiveComparePchPchCchCch in the reference implementation.
+  auto BucketCmp = [](const BulkPublic &L, const BulkPublic &R) {
+    if (L.U.BucketIdx != R.U.BucketIdx)
+      return L.U.BucketIdx < R.U.BucketIdx;
+    int Cmp = gsiRecordCmp(L.getName(), R.getName());
+    if (Cmp != 0)
+      return Cmp < 0;
+    // This comparison is necessary to make the sorting stable in the presence
+    // of two static globals with the same name. The easiest way to observe
+    // this is with S_LDATA32 records.
+    return L.SymOffset < R.SymOffset;
+  };
+  parallelSort(Globals, BucketCmp);
+
+  // Zero out the bucket index bitmap.
+  for (ulittle32_t &Word : HashBitmap)
+    Word = 0;
+
  // Compute the three tables: the hash records in bucket and chain order, the
  // bucket presence bitmap, and the bucket chain start offsets.
-  HashRecords.reserve(Records.size());
-  for (ulittle32_t &Word : HashBitmap)
-    Word = 0;
-  for (size_t BucketIdx = 0; BucketIdx < IPHR_HASH + 1; ++BucketIdx) {
-    auto &Bucket = TmpBuckets[BucketIdx];
-    if (Bucket.empty())
-      continue;
-    HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);
+  HashRecords.reserve(Globals.size());
+  uint32_t LastBucketIdx = ~0U;
+  for (const BulkPublic &Global : Globals) {
+    // If this is a new bucket, add it to the bitmap and the start offset map.
+    uint32_t BucketIdx = Global.U.BucketIdx;
+    if (LastBucketIdx != BucketIdx) {
+      HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);

-    // Calculate what the offset of the first hash record in the chain would
-    // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
-    // each record would be 12 bytes. See HROffsetCalc in gsi.h.
-    const int SizeOfHROffsetCalc = 12;
-    ulittle32_t ChainStartOff =
-        ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
-    HashBuckets.push_back(ChainStartOff);
+      // Calculate what the offset of the first hash record in the chain would
+      // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
+      // each record would be 12 bytes. See HROffsetCalc in gsi.h.
+      const int SizeOfHROffsetCalc = 12;
+      ulittle32_t ChainStartOff =
+          ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
+      HashBuckets.push_back(ChainStartOff);
+      LastBucketIdx = BucketIdx;
+    }

-    // Sort each bucket by memcmp of the symbol's name.  It's important that
-    // we use the same sorting algorithm as is used by the reference
-    // implementation to ensure that the search for a record within a bucket
-    // can properly early-out when it detects the record won't be found.  The
-    // algorithm used here corredsponds to the function
-    // caseInsensitiveComparePchPchCchCch in the reference implementation.
-    llvm::sort(Bucket, [](const std::pair<StringRef, PSHashRecord> &Left,
-                          const std::pair<StringRef, PSHashRecord> &Right) {
-      return gsiRecordLess(Left.first, Right.first);
-    });
-
-    for (const auto &Entry : Bucket)
-      HashRecords.push_back(Entry.second);
+    // Create the hash record. Add one when writing symbol offsets to disk.
+    // See GSI1::fixSymRecs. Always use a refcount of 1 for now.
+    PSHashRecord HRec;
+    HRec.Off = Global.SymOffset + 1;
+    HRec.CRef = 1;
+    HashRecords.push_back(HRec);
  }
 }

@ -191,7 +267,7 @@ uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
  uint32_t Size = 0;
  Size += sizeof(PublicsStreamHeader);
  Size += PSH->calculateSerializedLength();
-  Size += PSH->Records.size() * sizeof(uint32_t); // AddrMap
+  Size += PubAddrMap.size() * sizeof(uint32_t); // AddrMap
  // FIXME: Add thunk map and section offsets for incremental linking.

  return Size;
@ -203,11 +279,9 @@ uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {

 Error GSIStreamBuilder::finalizeMsfLayout() {
  // First we write public symbol records, then we write global symbol records.
-  uint32_t PSHZero = 0;
-  uint32_t GSHZero = PSH->calculateRecordByteSize();
-
-  PSH->finalizeBuckets(PSHZero);
-  GSH->finalizeBuckets(GSHZero);
+  uint32_t PublicsSize = PSH->calculateRecordByteSize();
+  uint32_t GlobalsSize = GSH->calculateRecordByteSize();
+  GSH->finalizeBuckets(PublicsSize);

  Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
  if (!Idx)
@ -219,8 +293,7 @@ Error GSIStreamBuilder::finalizeMsfLayout() {
    return Idx.takeError();
  PublicsStreamIndex = *Idx;

-  uint32_t RecordBytes =
-      GSH->calculateRecordByteSize() + PSH->calculateRecordByteSize();
+  uint32_t RecordBytes = PublicsSize + GlobalsSize;

  Idx = Msf.addStream(RecordBytes);
  if (!Idx)
@ -229,65 +302,56 @@ Error GSIStreamBuilder::finalizeMsfLayout() {
  return Error::success();
 }

-static StringRef extractPubSym(const CVSymbol *Sym, uint16_t &Seg,
-                               uint32_t &Offset) {
-  ArrayRef<uint8_t> Buf = Sym->content();
-  assert(Buf.size() > sizeof(PublicSym32Header));
-  const auto *Hdr = reinterpret_cast<const PublicSym32Header *>(Buf.data());
-  Buf = Buf.drop_front(sizeof(PublicSym32Header));
-  Seg = Hdr->Segment;
-  Offset = Hdr->Offset;
-  // Don't worry about finding the null terminator, since the strings will be
-  // compared later.
-  return StringRef(reinterpret_cast<const char *>(Buf.data()), Buf.size());
-}
-
-static bool comparePubSymByAddrAndName(const CVSymbol *LS, const CVSymbol *RS) {
-  uint16_t LSeg, RSeg;
-  uint32_t LOff, ROff;
-  StringRef LName, RName;
-  LName = extractPubSym(LS, LSeg, LOff);
-  RName = extractPubSym(RS, RSeg, ROff);
-  if (LSeg != RSeg)
-    return LSeg < RSeg;
-  if (LOff != ROff)
-    return LOff < ROff;
-  return LName < RName;
-}
-
-/// Compute the address map. The address map is an array of symbol offsets
-/// sorted so that it can be binary searched by address.
-static std::vector<ulittle32_t> computeAddrMap(ArrayRef<CVSymbol> Records) {
-  // Make a vector of pointers to the symbols so we can sort it by address.
-  // Also gather the symbol offsets while we're at it.
-
-  std::vector<const CVSymbol *> PublicsByAddr;
-  std::vector<uint32_t> SymOffsets;
-  PublicsByAddr.reserve(Records.size());
-  SymOffsets.reserve(Records.size());
+void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&Publics) {
+  // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
+  parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
+    return L.getName() < R.getName();
+  });

+  // Assign offsets and allocate one contiguous block of memory for all public
+  // symbols.
  uint32_t SymOffset = 0;
-  for (const CVSymbol &Sym : Records) {
-    assert(Sym.kind() == SymbolKind::S_PUB32);
-    PublicsByAddr.push_back(&Sym);
-    SymOffsets.push_back(SymOffset);
-    SymOffset += Sym.length();
+  for (BulkPublic &Pub : Publics) {
+    Pub.SymOffset = SymOffset;
+    SymOffset += sizeOfPublic(Pub);
  }
-  llvm::stable_sort(PublicsByAddr, comparePubSymByAddrAndName);
+  uint8_t *Mem =
+      reinterpret_cast<uint8_t *>(Msf.getAllocator().Allocate(SymOffset, 4));
+
+  // Instead of storing individual CVSymbol records, store them as one giant
+  // buffer.
+  // FIXME: This is kind of a hack. This makes Records.size() wrong, and we have
+  // to account for that elsewhere.
+  PSH->Records.push_back(CVSymbol(makeArrayRef(Mem, SymOffset)));
+
+  // Serialize them in parallel.
+  parallelForEachN(0, Publics.size(), [&](size_t I) {
+    const BulkPublic &Pub = Publics[I];
+    serializePublic(Mem + Pub.SymOffset, Pub);
+  });
+
+  // Re-sort the publics by address so we can build the address map. We no
+  // longer need the original ordering.
+  auto AddrCmp = [](const BulkPublic &L, const BulkPublic &R) {
+    if (L.U.Segment != R.U.Segment)
+      return L.U.Segment < R.U.Segment;
+    if (L.Offset != R.Offset)
+      return L.Offset < R.Offset;
+    // parallelSort is unstable, so we have to do name comparison to ensure
+    // that two names for the same location come out in a determinstic order.
+    return L.getName() < R.getName();
+  };
+  parallelSort(Publics, AddrCmp);

  // Fill in the symbol offsets in the appropriate order.
-  std::vector<ulittle32_t> AddrMap;
-  AddrMap.reserve(Records.size());
-  for (const CVSymbol *Sym : PublicsByAddr) {
-    ptrdiff_t Idx = std::distance(Records.data(), Sym);
-    assert(Idx >= 0 && size_t(Idx) < Records.size());
-    AddrMap.push_back(ulittle32_t(SymOffsets[Idx]));
-  }
-  return AddrMap;
-}
+  PubAddrMap.reserve(Publics.size());
+  for (const BulkPublic &Pub : Publics)
+    PubAddrMap.push_back(ulittle32_t(Pub.SymOffset));

-void GSIStreamBuilder::addPublicSymbol(const PublicSym32 &Pub) {
-  PSH->addSymbol(Pub, Msf);
+  // Finalize public symbol buckets immediately after they have been added.
+  // They should all be warm in the cache at this point, so go ahead and do it
+  // now.
+  PSH->finalizeBuckets(0, std::move(Publics));
 }

 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
@ -336,7 +400,7 @@ Error GSIStreamBuilder::commitPublicsHashStream(

  // FIXME: Fill these in. They are for incremental linking.
  Header.SymHash = PSH->calculateSerializedLength();
-  Header.AddrMap = PSH->Records.size() * 4;
+  Header.AddrMap = PubAddrMap.size() * 4;
  Header.NumThunks = 0;
  Header.SizeOfThunk = 0;
  Header.ISectThunkTable = 0;
@ -349,8 +413,7 @@ Error GSIStreamBuilder::commitPublicsHashStream(
  if (auto EC = PSH->commit(Writer))
    return EC;

-  std::vector<ulittle32_t> AddrMap = computeAddrMap(PSH->Records);
-  if (auto EC = Writer.writeArray(makeArrayRef(AddrMap)))
+  if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
    return EC;

  return Error::success();