//===-- Clustering.cpp ------------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #include "Clustering.h" #include #include namespace exegesis { // The clustering problem has the following characteristics: // (A) - Low dimension (dimensions are typically proc resource units, // typically < 10). // (B) - Number of points : ~thousands (points are measurements of an MCInst) // (C) - Number of clusters: ~tens. // (D) - The number of clusters is not known /a priory/. // (E) - The amount of noise is relatively small. // The problem is rather small. In terms of algorithms, (D) disqualifies // k-means and makes algorithms such as DBSCAN[1] or OPTICS[2] more applicable. // // We've used DBSCAN here because it's simple to implement. This is a pretty // straightforward and inefficient implementation of the pseudocode in [2]. // // [1] https://en.wikipedia.org/wiki/DBSCAN // [2] https://en.wikipedia.org/wiki/OPTICS_algorithm // Finds the points at distance less than sqrt(EpsilonSquared) of Q (not // including Q). std::vector InstructionBenchmarkClustering::rangeQuery(const size_t Q) const { std::vector Neighbors; const auto &QMeasurements = Points_[Q].Measurements; for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { if (P == Q) continue; const auto &PMeasurements = Points_[P].Measurements; if (PMeasurements.empty()) // Error point. continue; if (isNeighbour(PMeasurements, QMeasurements)) { Neighbors.push_back(P); } } return Neighbors; } bool InstructionBenchmarkClustering::isNeighbour( const std::vector &P, const std::vector &Q) const { double DistanceSquared = 0.0; for (size_t I = 0, E = P.size(); I < E; ++I) { const auto Diff = P[I].Value - Q[I].Value; DistanceSquared += Diff * Diff; } return DistanceSquared <= EpsilonSquared_; } InstructionBenchmarkClustering::InstructionBenchmarkClustering( const std::vector &Points, const double EpsilonSquared) : Points_(Points), EpsilonSquared_(EpsilonSquared), NoiseCluster_(ClusterId::noise()), ErrorCluster_(ClusterId::error()) {} llvm::Error InstructionBenchmarkClustering::validateAndSetup() { ClusterIdForPoint_.resize(Points_.size()); // Mark erroneous measurements out. // All points must have the same number of dimensions, in the same order. const std::vector *LastMeasurement = nullptr; for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { const auto &Point = Points_[P]; if (!Point.Error.empty()) { ClusterIdForPoint_[P] = ClusterId::error(); ErrorCluster_.PointIndices.push_back(P); continue; } const auto *CurMeasurement = &Point.Measurements; if (LastMeasurement) { if (LastMeasurement->size() != CurMeasurement->size()) { return llvm::make_error( "inconsistent measurement dimensions", llvm::inconvertibleErrorCode()); } for (size_t I = 0, E = LastMeasurement->size(); I < E; ++I) { if (LastMeasurement->at(I).Key != CurMeasurement->at(I).Key) { return llvm::make_error( "inconsistent measurement dimensions keys", llvm::inconvertibleErrorCode()); } } } LastMeasurement = CurMeasurement; } if (LastMeasurement) { NumDimensions_ = LastMeasurement->size(); } return llvm::Error::success(); } void InstructionBenchmarkClustering::dbScan(const size_t MinPts) { for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { if (!ClusterIdForPoint_[P].isUndef()) continue; // Previously processed in inner loop. const auto Neighbors = rangeQuery(P); if (Neighbors.size() + 1 < MinPts) { // Density check. // The region around P is not dense enough to create a new cluster, mark // as noise for now. ClusterIdForPoint_[P] = ClusterId::noise(); continue; } // Create a new cluster, add P. Clusters_.emplace_back(ClusterId::makeValid(Clusters_.size())); Cluster &CurrentCluster = Clusters_.back(); ClusterIdForPoint_[P] = CurrentCluster.Id; /* Label initial point */ CurrentCluster.PointIndices.push_back(P); // Process P's neighbors. std::unordered_set ToProcess(Neighbors.begin(), Neighbors.end()); while (!ToProcess.empty()) { // Retrieve a point from the set. const size_t Q = *ToProcess.begin(); ToProcess.erase(Q); if (ClusterIdForPoint_[Q].isNoise()) { // Change noise point to border point. ClusterIdForPoint_[Q] = CurrentCluster.Id; CurrentCluster.PointIndices.push_back(Q); continue; } if (!ClusterIdForPoint_[Q].isUndef()) { continue; // Previously processed. } // Add Q to the current custer. ClusterIdForPoint_[Q] = CurrentCluster.Id; CurrentCluster.PointIndices.push_back(Q); // And extend to the neighbors of Q if the region is dense enough. const auto Neighbors = rangeQuery(Q); if (Neighbors.size() + 1 >= MinPts) { ToProcess.insert(Neighbors.begin(), Neighbors.end()); } } } // Add noisy points to noise cluster. for (size_t P = 0, NumPoints = Points_.size(); P < NumPoints; ++P) { if (ClusterIdForPoint_[P].isNoise()) { NoiseCluster_.PointIndices.push_back(P); } } } llvm::Expected InstructionBenchmarkClustering::create( const std::vector &Points, const size_t MinPts, const double Epsilon) { InstructionBenchmarkClustering Clustering(Points, Epsilon * Epsilon); if (auto Error = Clustering.validateAndSetup()) { return std::move(Error); } if (Clustering.ErrorCluster_.PointIndices.size() == Points.size()) { return Clustering; // Nothing to cluster. } Clustering.dbScan(MinPts); return Clustering; } } // namespace exegesis