llvm-mirror/unittests/Support/TrigramIndexTest.cpp

//===- TrigramIndexTest.cpp - Unit tests for TrigramIndex -----------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/TrigramIndex.h"
#include "llvm/ADT/STLExtras.h"
#include "gtest/gtest.h"

#include <string>
#include <vector>

using namespace llvm;

namespace {

/// Fixture shared by all TrigramIndex tests; it only provides a helper that
/// builds an index from a list of rules.
class TrigramIndexTest : public ::testing::Test {
protected:
  /// Creates a new TrigramIndex and inserts each entry of \p Rules into it,
  /// in the order given. Ownership of the index is returned to the caller.
  std::unique_ptr<TrigramIndex> makeTrigramIndex(
      std::vector<std::string> Rules) {
    auto Index = make_unique<TrigramIndex>();
    for (std::string &Pattern : Rules)
      Index->insert(Pattern);
    return Index;
  }
};

// With no rules inserted, the index must not report itself as defeated and
// must report the query "foo" as definitely out.
TEST_F(TrigramIndexTest, Empty) {
  auto Index = makeTrigramIndex({});
  EXPECT_FALSE(Index->isDefeated());
  EXPECT_TRUE(Index->isDefinitelyOut("foo"));
}

// Two wildcard rules that each contain a run of three literal characters
// ("hello", "wor"): the index is expected to stay usable and to rule out the
// unrelated query "foo".
TEST_F(TrigramIndexTest, Basic) {
  const std::vector<std::string> Rules = {"*hello*", "*wor.d*"};
  auto Index = makeTrigramIndex(Rules);
  EXPECT_FALSE(Index->isDefeated());
  EXPECT_TRUE(Index->isDefinitelyOut("foo"));
}

// Neither "b.r" nor "za*az" has three literal characters in a row. The test
// expects the index to be defeated and, once defeated, to never claim that a
// query is definitely out -- including the unrelated "foo".
TEST_F(TrigramIndexTest, NoTrigramsInRules) {
  auto Owner = makeTrigramIndex({"b.r", "za*az"});
  TrigramIndex &Index = *Owner;
  EXPECT_TRUE(Index.isDefeated());
  EXPECT_FALSE(Index.isDefinitelyOut("foo"));
  EXPECT_FALSE(Index.isDefinitelyOut("bar"));
  EXPECT_FALSE(Index.isDefinitelyOut("zakaz"));
}

// Only one of the two rules ("*wo.ld*") lacks a run of three literal
// characters; the test expects that single rule to be enough to defeat the
// whole index, so "foo" is no longer reported as definitely out.
TEST_F(TrigramIndexTest, NoTrigramsInARule) {
  const std::vector<std::string> Rules = {"*hello*", "*wo.ld*"};
  auto Index = makeTrigramIndex(Rules);
  EXPECT_TRUE(Index->isDefeated());
  EXPECT_FALSE(Index->isDefinitelyOut("foo"));
}

// Rules that repeat the literal "bar" several times. The test expects the
// index to remain usable, to rule out "foo" and a lone "bar", and to keep
// "barbara" and "bar+bar" as possible matches.
TEST_F(TrigramIndexTest, RepetitiveRule) {
  auto Owner = makeTrigramIndex({"*bar*bar*bar*bar*bar", "bar*bar"});
  TrigramIndex &Index = *Owner;
  EXPECT_FALSE(Index.isDefeated());
  EXPECT_TRUE(Index.isDefinitelyOut("foo"));
  EXPECT_TRUE(Index.isDefinitelyOut("bar"));
  EXPECT_FALSE(Index.isDefinitelyOut("barbara"));
  EXPECT_FALSE(Index.isDefinitelyOut("bar+bar"));
}

// Five rules that all contain the trigram "aaa": the test expects such a
// heavily shared trigram to defeat the index.
TEST_F(TrigramIndexTest, PopularTrigram) {
  const std::vector<std::string> Rules = {"*aaa*", "*aaaa*", "*aaaaa*",
                                          "*aaaaa*", "*aaaaaa*"};
  auto Index = makeTrigramIndex(Rules);
  EXPECT_TRUE(Index->isDefeated());
}

// Five file-name style rules sharing the "class" prefix: the test expects the
// index to be defeated once all five have been inserted.
TEST_F(TrigramIndexTest, PopularTrigram2) {
  const std::vector<std::string> Rules = {"class1.h", "class2.h", "class3.h",
                                          "class4.h", "class.h"};
  auto Index = makeTrigramIndex(Rules);
  EXPECT_TRUE(Index->isDefeated());
}

// A rule made of a bracket expression followed by '+' is expected to defeat
// the index.
TEST_F(TrigramIndexTest, TooComplicatedRegex) {
  auto Index = makeTrigramIndex({"[0-9]+"});
  EXPECT_TRUE(Index->isDefeated());
}

// A rule containing the alternation operator '|' is expected to defeat the
// index.
TEST_F(TrigramIndexTest, TooComplicatedRegex2) {
  auto Index = makeTrigramIndex({"foo|bar"});
  EXPECT_TRUE(Index->isDefeated());
}

// Rules that use backslash escapes (escaped '+', an escaped backslash, and
// the sequences backslash-'t' and backslash-'0'). The expectations require
// that the index stays usable, that queries containing the plain unescaped
// character are kept as possible matches, and that queries which still carry
// the escaping backslash (or a real tab character) are ruled out.
TEST_F(TrigramIndexTest, EscapedSymbols) {
  const std::vector<std::string> Rules = {"*c\\+\\+*", "*hello\\\\world*",
                                          "a\\tb", "a\\0b"};
  auto Owner = makeTrigramIndex(Rules);
  TrigramIndex &Index = *Owner;
  EXPECT_FALSE(Index.isDefeated());

  // Escaped plus signs.
  EXPECT_FALSE(Index.isDefinitelyOut("c++"));
  EXPECT_TRUE(Index.isDefinitelyOut("c\\+\\+"));

  // Escaped backslash.
  EXPECT_FALSE(Index.isDefinitelyOut("hello\\world"));
  EXPECT_TRUE(Index.isDefinitelyOut("hello\\\\world"));

  // Backslash followed by 't' in the rule.
  EXPECT_FALSE(Index.isDefinitelyOut("atb"));
  EXPECT_TRUE(Index.isDefinitelyOut("a\\tb"));
  EXPECT_TRUE(Index.isDefinitelyOut("a\tb"));

  // Backslash followed by '0' in the rule.
  EXPECT_FALSE(Index.isDefinitelyOut("a0b"));
}

// A rule containing backslash-'1' (a backreference, per the test name) is
// expected to defeat the index.
TEST_F(TrigramIndexTest, Backreference1) {
  auto Index = makeTrigramIndex({"*foo\\1*"});
  EXPECT_TRUE(Index->isDefeated());
}

// Same as the backslash-'1' case, but with backslash-'2': the index is
// expected to be defeated.
TEST_F(TrigramIndexTest, Backreference2) {
  auto Index = makeTrigramIndex({"*foo\\2*"});
  EXPECT_TRUE(Index->isDefeated());
}

// Four file-name style rules sharing the "class" prefix. The test expects
// the index to remain usable, to keep "class1" as a possible match, and to
// rule out "class.h" and "class".
TEST_F(TrigramIndexTest, Sequence) {
  const std::vector<std::string> Rules = {"class1.h", "class2.h", "class3.h",
                                          "class4.h"};
  auto Owner = makeTrigramIndex(Rules);
  TrigramIndex &Index = *Owner;
  EXPECT_FALSE(Index.isDefeated());
  EXPECT_FALSE(Index.isDefinitelyOut("class1"));
  EXPECT_TRUE(Index.isDefinitelyOut("class.h"));
  EXPECT_TRUE(Index.isDefinitelyOut("class"));
}

}  // namespace
Use trigrams to speed up SpecialCaseList. Summary: it's often the case when the rules in the SpecialCaseList are of the form hel.o*bar. That gives us a chance to build a trigram index to quickly discard 99% of inputs without running a full regex. A similar idea was used in Google Code Search as described in the blog post: https://swtch.com/~rsc/regexp/regexp4.html The check is defeated if there's at least one regex more complicated than that. In this case, all inputs will go through the regex. That said, the real-world rules are often simple or can be simplified. That considerably speeds up compiling Chromium with CFI and UBSan. As measured on Chromium's content_message_generator.cc: before, CFI: 44 s after, CFI: 23 s after, CFI, no blacklist: 23 s (~1% slower, but 3 runs were unable to show the difference) after, regular compilation to bitcode: 23 s Reviewers: pcc Subscribers: mgorny, llvm-commits Differential Revision: https://reviews.llvm.org/D27188 llvm-svn: 288303 2016-12-01 03:54:54 +01:00			`//===- TrigramIndexTest.cpp - Unit tests for TrigramIndex -----------------===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "llvm/Support/TrigramIndex.h"`
Re-sort #include lines for unittests. This uses a slightly modified clang-format (https://reviews.llvm.org/D33932) to keep primary headers at the top and handle new utility headers like 'gmock' consistently with other utility headers. No other change was made. I did no manual edits; all of this is clang-format. This should allow other changes to have clearer and more focused diffs, and is especially motivated by moving some headers into more focused libraries. llvm-svn: 304786 2017-06-06 13:06:56 +02:00			`#include "llvm/ADT/STLExtras.h"`
Use trigrams to speed up SpecialCaseList. Summary: it's often the case when the rules in the SpecialCaseList are of the form hel.o*bar. That gives us a chance to build a trigram index to quickly discard 99% of inputs without running a full regex. A similar idea was used in Google Code Search as described in the blog post: https://swtch.com/~rsc/regexp/regexp4.html The check is defeated if there's at least one regex more complicated than that. In this case, all inputs will go through the regex. That said, the real-world rules are often simple or can be simplified. That considerably speeds up compiling Chromium with CFI and UBSan. As measured on Chromium's content_message_generator.cc: before, CFI: 44 s after, CFI: 23 s after, CFI, no blacklist: 23 s (~1% slower, but 3 runs were unable to show the difference) after, regular compilation to bitcode: 23 s Reviewers: pcc Subscribers: mgorny, llvm-commits Differential Revision: https://reviews.llvm.org/D27188 llvm-svn: 288303 2016-12-01 03:54:54 +01:00			`#include "gtest/gtest.h"`

			`#include <string>`
			`#include <vector>`

			`using namespace llvm;`

			`namespace {`

			`class TrigramIndexTest : public ::testing::Test {`
			`protected:`
			`std::unique_ptr<TrigramIndex> makeTrigramIndex(`
			`std::vector<std::string> Rules) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`make_unique<TrigramIndex>();`
			`for (auto &Rule : Rules)`
			`TI->insert(Rule);`
			`return TI;`
			`}`
			`};`

			`TEST_F(TrigramIndexTest, Empty) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({});`
			`EXPECT_FALSE(TI->isDefeated());`
			`EXPECT_TRUE(TI->isDefinitelyOut("foo"));`
			`}`

			`TEST_F(TrigramIndexTest, Basic) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"hello", "wor.d"});`
			`EXPECT_FALSE(TI->isDefeated());`
			`EXPECT_TRUE(TI->isDefinitelyOut("foo"));`
			`}`

			`TEST_F(TrigramIndexTest, NoTrigramsInRules) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"b.r", "za*az"});`
			`EXPECT_TRUE(TI->isDefeated());`
			`EXPECT_FALSE(TI->isDefinitelyOut("foo"));`
			`EXPECT_FALSE(TI->isDefinitelyOut("bar"));`
			`EXPECT_FALSE(TI->isDefinitelyOut("zakaz"));`
			`}`

			`TEST_F(TrigramIndexTest, NoTrigramsInARule) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"hello", "wo.ld"});`
			`EXPECT_TRUE(TI->isDefeated());`
			`EXPECT_FALSE(TI->isDefinitelyOut("foo"));`
			`}`

			`TEST_F(TrigramIndexTest, RepetitiveRule) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"barbarbarbarbar", "barbar"});`
			`EXPECT_FALSE(TI->isDefeated());`
			`EXPECT_TRUE(TI->isDefinitelyOut("foo"));`
			`EXPECT_TRUE(TI->isDefinitelyOut("bar"));`
			`EXPECT_FALSE(TI->isDefinitelyOut("barbara"));`
			`EXPECT_FALSE(TI->isDefinitelyOut("bar+bar"));`
			`}`

			`TEST_F(TrigramIndexTest, PopularTrigram) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"aaa", "aaaa", "aaaaa", "aaaaa", "aaaaaa"});`
			`EXPECT_TRUE(TI->isDefeated());`
			`}`

			`TEST_F(TrigramIndexTest, PopularTrigram2) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"class1.h", "class2.h", "class3.h", "class4.h", "class.h"});`
			`EXPECT_TRUE(TI->isDefeated());`
			`}`

			`TEST_F(TrigramIndexTest, TooComplicatedRegex) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"[0-9]+"});`
			`EXPECT_TRUE(TI->isDefeated());`
			`}`

			`TEST_F(TrigramIndexTest, TooComplicatedRegex2) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"foo\|bar"});`
			`EXPECT_TRUE(TI->isDefeated());`
			`}`

Support escaping in TrigramIndex. Summary: This is a follow up to r288303, where I have introduced TrigramIndex to speed up SpecialCaseList for the cases when all rules are simple wildcards, like hellowor.d. Here, I add support for escaping, so that it's possible to specify rules like c\+\+abi*. Reviewers: pcc Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D27318 llvm-svn: 288553 2016-12-03 00:30:16 +01:00			`TEST_F(TrigramIndexTest, EscapedSymbols) {`
Use trigrams to speed up SpecialCaseList. Summary: it's often the case when the rules in the SpecialCaseList are of the form hel.o*bar. That gives us a chance to build a trigram index to quickly discard 99% of inputs without running a full regex. A similar idea was used in Google Code Search as described in the blog post: https://swtch.com/~rsc/regexp/regexp4.html The check is defeated if there's at least one regex more complicated than that. In this case, all inputs will go through the regex. That said, the real-world rules are often simple or can be simplified. That considerably speeds up compiling Chromium with CFI and UBSan. As measured on Chromium's content_message_generator.cc: before, CFI: 44 s after, CFI: 23 s after, CFI, no blacklist: 23 s (~1% slower, but 3 runs were unable to show the difference) after, regular compilation to bitcode: 23 s Reviewers: pcc Subscribers: mgorny, llvm-commits Differential Revision: https://reviews.llvm.org/D27188 llvm-svn: 288303 2016-12-01 03:54:54 +01:00			`std::unique_ptr<TrigramIndex> TI =`
Support escaping in TrigramIndex. Summary: This is a follow up to r288303, where I have introduced TrigramIndex to speed up SpecialCaseList for the cases when all rules are simple wildcards, like hellowor.d. Here, I add support for escaping, so that it's possible to specify rules like c\+\+abi*. Reviewers: pcc Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D27318 llvm-svn: 288553 2016-12-03 00:30:16 +01:00			`makeTrigramIndex({"c\\+\\+", "hello\\\\world", "a\\tb", "a\\0b"});`
			`EXPECT_FALSE(TI->isDefeated());`
			`EXPECT_FALSE(TI->isDefinitelyOut("c++"));`
			`EXPECT_TRUE(TI->isDefinitelyOut("c\\+\\+"));`
			`EXPECT_FALSE(TI->isDefinitelyOut("hello\\world"));`
			`EXPECT_TRUE(TI->isDefinitelyOut("hello\\\\world"));`
			`EXPECT_FALSE(TI->isDefinitelyOut("atb"));`
			`EXPECT_TRUE(TI->isDefinitelyOut("a\\tb"));`
			`EXPECT_TRUE(TI->isDefinitelyOut("a\tb"));`
			`EXPECT_FALSE(TI->isDefinitelyOut("a0b"));`
			`}`

			`TEST_F(TrigramIndexTest, Backreference1) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"foo\\1"});`
			`EXPECT_TRUE(TI->isDefeated());`
			`}`

			`TEST_F(TrigramIndexTest, Backreference2) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"foo\\2"});`
Use trigrams to speed up SpecialCaseList. Summary: it's often the case when the rules in the SpecialCaseList are of the form hel.o*bar. That gives us a chance to build a trigram index to quickly discard 99% of inputs without running a full regex. A similar idea was used in Google Code Search as described in the blog post: https://swtch.com/~rsc/regexp/regexp4.html The check is defeated if there's at least one regex more complicated than that. In this case, all inputs will go through the regex. That said, the real-world rules are often simple or can be simplified. That considerably speeds up compiling Chromium with CFI and UBSan. As measured on Chromium's content_message_generator.cc: before, CFI: 44 s after, CFI: 23 s after, CFI, no blacklist: 23 s (~1% slower, but 3 runs were unable to show the difference) after, regular compilation to bitcode: 23 s Reviewers: pcc Subscribers: mgorny, llvm-commits Differential Revision: https://reviews.llvm.org/D27188 llvm-svn: 288303 2016-12-01 03:54:54 +01:00			`EXPECT_TRUE(TI->isDefeated());`
			`}`

			`TEST_F(TrigramIndexTest, Sequence) {`
			`std::unique_ptr<TrigramIndex> TI =`
			`makeTrigramIndex({"class1.h", "class2.h", "class3.h", "class4.h"});`
			`EXPECT_FALSE(TI->isDefeated());`
			`EXPECT_FALSE(TI->isDefinitelyOut("class1"));`
			`EXPECT_TRUE(TI->isDefinitelyOut("class.h"));`
			`EXPECT_TRUE(TI->isDefinitelyOut("class"));`
			`}`

			`} // namespace`