DFSan-based fuzzer (proof of concept).

Summary: This adds a simple DFSan-based (i.e. taint-guided) fuzzer mutator, see the comments for details. Test Plan: a test added Reviewers: samsonov, pcc Reviewed By: samsonov, pcc Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D8669 llvm-svn: 233613
2024-11-23 03:02:36 +01:00 · 2015-03-30 22:09:51 +00:00 · 2015-03-30 22:09:51 +00:00 · bf919ef6ab
commit bf919ef6ab
parent e77dfb9824
11 changed files with 361 additions and 4 deletions
--- a/lib/Fuzzer/CMakeLists.txt
+++ b/lib/Fuzzer/CMakeLists.txt
@ -1,8 +1,10 @@
-# Disable the coverage instrumentation for the fuzzer itself.
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2 -fsanitize-coverage=0")
+set(LIBFUZZER_FLAGS_BASE "${CMAKE_CXX_FLAGS_RELEASE}")
+# Disable the coverage and sanitizer instrumentation for the fuzzer itself.
+set(CMAKE_CXX_FLAGS_RELEASE "${LIBFUZZER_FLAGS_BASE} -O2 -fno-sanitize=all")
 if( LLVM_USE_SANITIZE_COVERAGE  )
  add_library(LLVMFuzzerNoMain OBJECT
    FuzzerCrossOver.cpp
+    FuzzerDFSan.cpp
    FuzzerDriver.cpp
    FuzzerIO.cpp
    FuzzerLoop.cpp
--- a/lib/Fuzzer/FuzzerDFSan.cpp
+++ b/lib/Fuzzer/FuzzerDFSan.cpp
@ -0,0 +1,275 @@
+//===- FuzzerDFSan.cpp - DFSan-based fuzzer mutator -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// DataFlowSanitizer (DFSan) is a tool for
+// generalised dynamic data flow (taint) analysis:
+// http://clang.llvm.org/docs/DataFlowSanitizer.html .
+//
+// This file implements a mutation algorithm based on taint
+// analysis feedback from DFSan.
+//
+// The approach has some similarity to "Taint-based Directed Whitebox Fuzzing"
+// by Vijay Ganesh & Tim Leek & Martin Rinard:
+// http://dspace.mit.edu/openaccess-disseminate/1721.1/59320,
+// but it uses a full blown LLVM IR taint analysis and separate instrumentation
+// to analyze all of the "attack points" at once.
+//
+// Workflow:
+//   * lib/Fuzzer/Fuzzer*.cpp is compiled w/o any instrumentation.
+//   * The code under test is compiled with DFSan *and* with special extra hooks
+//     that are inserted before dfsan. Currently supported hooks:
+//     - __sanitizer_cov_trace_cmp: inserted before every ICMP instruction,
+//       receives the type, size and arguments of ICMP.
+//   * Every call to HOOK(a,b) is replaced by DFSan with
+//     __dfsw_HOOK(a, b, label(a), label(b)) so that __dfsw_HOOK
+//     gets all the taint labels for the arguments.
+//   * At the Fuzzer startup we assign a unique DFSan label
+//     to every byte of the input string (Fuzzer::CurrentUnit) so that for any
+//     chunk of data we know which input bytes it has derived from.
+//   * The __dfsw_* functions (implemented in this file) record the
+//     parameters (i.e. the application data and the corresponding taint labels)
+//     in a global state.
+//   * Fuzzer::MutateWithDFSan() tries to use the data recorded by __dfsw_*
+//     hooks to guide the fuzzing towards new application states.
+//     For example if 4 bytes of data that derive from input bytes {4,5,6,7}
+//     are compared with a constant 12345 and the comparison always yields
+//     the same result, we try to insert 12345, 12344, 12346 into bytes
+//     {4,5,6,7} of the next fuzzed inputs.
+//
+// This code does not function when DFSan is not linked in.
+// Instead of using ifdefs and thus requiring a separate build of lib/Fuzzer
+// we redeclare the dfsan_* interface functions as weak and check if they
+// are nullptr before calling.
+// If this approach proves to be useful we may add attribute(weak) to the
+// dfsan declarations in dfsan_interface.h
+//
+// This module is in the "proof of concept" stage.
+// It is capable of solving only the simplest puzzles
+// like test/dfsan/DFSanSimpleCmpTest.cpp.
+//===----------------------------------------------------------------------===//
+
+/* Example of manual usage:
+(
+  cd $LLVM/lib/Fuzzer/
+  clang  -fPIC -c -g -O2 -std=c++11 Fuzzer*.cpp
+  clang++ -O0 -std=c++11 -fsanitize-coverage=3  \
+    -mllvm -sanitizer-coverage-experimental-trace-compares=1 \
+    -fsanitize=dataflow -fsanitize-blacklist=./dfsan_fuzzer_abi.list  \
+    test/dfsan/DFSanSimpleCmpTest.cpp Fuzzer*.o
+  ./a.out
+)
+*/
+
+#include "FuzzerInternal.h"
+#include <sanitizer/dfsan_interface.h>
+
+#include <cstring>
+#include <iostream>
+#include <unordered_map>
+
+extern "C" {
+__attribute__((weak))
+dfsan_label dfsan_create_label(const char *desc, void *userdata);
+__attribute__((weak))
+void dfsan_set_label(dfsan_label label, void *addr, size_t size);
+__attribute__((weak))
+void dfsan_add_label(dfsan_label label, void *addr, size_t size);
+__attribute__((weak))
+const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label);
+}  // extern "C"
+
+namespace {
+
+// These values are copied from include/llvm/IR/InstrTypes.h.
+// We do not include the LLVM headers here to remain independent.
+// If these values ever change, an assertion in ComputeCmp will fail.
+enum Predicate {
+  ICMP_EQ = 32,  ///< equal
+  ICMP_NE = 33,  ///< not equal
+  ICMP_UGT = 34, ///< unsigned greater than
+  ICMP_UGE = 35, ///< unsigned greater or equal
+  ICMP_ULT = 36, ///< unsigned less than
+  ICMP_ULE = 37, ///< unsigned less or equal
+  ICMP_SGT = 38, ///< signed greater than
+  ICMP_SGE = 39, ///< signed greater or equal
+  ICMP_SLT = 40, ///< signed less than
+  ICMP_SLE = 41, ///< signed less or equal
+};
+
+template <class U, class S>
+bool ComputeCmp(size_t CmpType, U Arg1, U Arg2) {
+  switch(CmpType) {
+    case ICMP_EQ : return Arg1 == Arg2;
+    case ICMP_NE : return Arg1 != Arg2;
+    case ICMP_UGT: return Arg1 > Arg2;
+    case ICMP_UGE: return Arg1 >= Arg2;
+    case ICMP_ULT: return Arg1 < Arg2;
+    case ICMP_ULE: return Arg1 <= Arg2;
+    case ICMP_SGT: return (S)Arg1 > (S)Arg2;
+    case ICMP_SGE: return (S)Arg1 >= (S)Arg2;
+    case ICMP_SLT: return (S)Arg1 < (S)Arg2;
+    case ICMP_SLE: return (S)Arg1 <= (S)Arg2;
+    default: assert(0 && "unsupported CmpType");
+  }
+  return false;
+}
+
+static bool ComputeCmp(size_t CmpSize, size_t CmpType, uint64_t Arg1,
+                       uint64_t Arg2) {
+  if (CmpSize == 8) return ComputeCmp<uint64_t, int64_t>(CmpType, Arg1, Arg2);
+  if (CmpSize == 4) return ComputeCmp<uint32_t, int32_t>(CmpType, Arg1, Arg2);
+  if (CmpSize == 2) return ComputeCmp<uint16_t, int16_t>(CmpType, Arg1, Arg2);
+  if (CmpSize == 1) return ComputeCmp<uint8_t, int8_t>(CmpType, Arg1, Arg2);
+  assert(0 && "unsupported type size");
+  return true;
+}
+
+// As a simplification we use the range of input bytes instead of a set of input
+// bytes.
+struct LabelRange {
+  uint16_t Beg, End;  // Range is [Beg, End), thus Beg==End is an empty range.
+
+  LabelRange(uint16_t Beg = 0, uint16_t End = 0) : Beg(Beg), End(End) {}
+
+  static LabelRange Join(LabelRange LR1, LabelRange LR2) {
+    if (LR1.Beg == LR1.End) return LR2;
+    if (LR2.Beg == LR2.End) return LR1;
+    return {std::min(LR1.Beg, LR2.Beg), std::max(LR1.End, LR2.End)};
+  }
+  LabelRange &Join(LabelRange LR) {
+    return *this = Join(*this, LR);
+  }
+  static LabelRange Singleton(const dfsan_label_info *LI) {
+    uint16_t Idx = (uint16_t)(uintptr_t)LI->userdata;
+    assert(Idx > 0);
+    return {(uint16_t)(Idx - 1), Idx};
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const LabelRange &LR) {
+  return os << "[" << LR.Beg << "," << LR.End << ")";
+}
+
+class DFSanState {
+ public:
+   DFSanState(const fuzzer::Fuzzer::FuzzingOptions &Options)
+       : Options(Options) {}
+
+  struct CmpSiteInfo {
+    size_t ResCounters[2] = {0, 0};
+    size_t CmpSize = 0;
+    LabelRange LR;
+    std::unordered_map<uint64_t, size_t> CountedConstants;
+  };
+
+  LabelRange GetLabelRange(dfsan_label L);
+  void DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
+                        uint64_t Arg1, uint64_t Arg2, dfsan_label L1,
+                        dfsan_label L2);
+  bool Mutate(fuzzer::Unit *U);
+
+ private:
+  std::unordered_map<uintptr_t, CmpSiteInfo> PcToCmpSiteInfoMap;
+  LabelRange LabelRanges[1 << (sizeof(dfsan_label) * 8)] = {};
+  const fuzzer::Fuzzer::FuzzingOptions &Options;
+};
+
+LabelRange DFSanState::GetLabelRange(dfsan_label L) {
+  LabelRange &LR = LabelRanges[L];
+  if (LR.Beg < LR.End || L == 0)
+    return LR;
+  const dfsan_label_info *LI = dfsan_get_label_info(L);
+  if (LI->l1 || LI->l2)
+    return LR = LabelRange::Join(GetLabelRange(LI->l1), GetLabelRange(LI->l2));
+  return LR = LabelRange::Singleton(LI);
+}
+
+void DFSanState::DFSanCmpCallback(uintptr_t PC, size_t CmpSize, size_t CmpType,
+                                  uint64_t Arg1, uint64_t Arg2, dfsan_label L1,
+                                  dfsan_label L2) {
+  if (L1 == 0 && L2 == 0)
+    return;  // Not actionable.
+  if (L1 != 0 && L2 != 0)
+    return;  // Probably still actionable.
+  bool Res = ComputeCmp(CmpSize, CmpType, Arg1, Arg2);
+  CmpSiteInfo &CSI = PcToCmpSiteInfoMap[PC];
+  CSI.CmpSize = CmpSize;
+  CSI.LR.Join(GetLabelRange(L1)).Join(GetLabelRange(L2));
+  if (!L1) CSI.CountedConstants[Arg1]++;
+  if (!L2) CSI.CountedConstants[Arg2]++;
+  size_t Counter = CSI.ResCounters[Res]++;
+
+  if (Options.Verbosity >= 2  &&
+      (Counter & (Counter - 1)) == 0 &&
+      CSI.ResCounters[!Res] == 0)
+    std::cerr << "DFSAN:"
+              << " PC " << std::hex << PC << std::dec
+              << " S " << CmpSize
+              << " T " << CmpType
+              << " A1 " << Arg1 << " A2 " << Arg2 << " R " << Res
+              << " L" << L1 << GetLabelRange(L1)
+              << " L" << L2 << GetLabelRange(L2)
+              << " LR " << CSI.LR
+              << "\n";
+}
+
+bool DFSanState::Mutate(fuzzer::Unit *U) {
+  for (auto &PCToCmp : PcToCmpSiteInfoMap) {
+    auto &CSI = PCToCmp.second;
+    if (CSI.ResCounters[0] * CSI.ResCounters[1] != 0) continue;
+    if (CSI.ResCounters[0] + CSI.ResCounters[1] < 1000) continue;
+    if (CSI.CountedConstants.size() != 1) continue;
+    uintptr_t C = CSI.CountedConstants.begin()->first;
+    if (U->size() >= CSI.CmpSize) {
+      size_t RangeSize = CSI.LR.End - CSI.LR.Beg;
+      size_t Idx = CSI.LR.Beg + rand() % RangeSize;
+      if (Idx + CSI.CmpSize > U->size()) continue;
+      C += rand() % 5 - 2;
+      memcpy(U->data() + Idx, &C, CSI.CmpSize);
+      return true;
+    }
+  }
+  return false;
+}
+
+static DFSanState *DFSan;
+
+}  // namespace
+
+namespace fuzzer {
+
+bool Fuzzer::MutateWithDFSan(Unit *U) {
+  if (!&dfsan_create_label || !DFSan) return false;
+  return DFSan->Mutate(U);
+}
+
+void Fuzzer::InitializeDFSan() {
+  if (!&dfsan_create_label || !Options.UseDFSan) return;
+  DFSan = new DFSanState(Options);
+  CurrentUnit.resize(Options.MaxLen);
+  for (size_t i = 0; i < static_cast<size_t>(Options.MaxLen); i++) {
+    dfsan_label L = dfsan_create_label("input", (void*)(i + 1));
+    // We assume that no one else has called dfsan_create_label before.
+    assert(L == i + 1);
+    dfsan_set_label(L, &CurrentUnit[i], 1);
+  }
+}
+
+}  // namespace fuzzer
+
+extern "C" {
+void __dfsw___sanitizer_cov_trace_cmp(uint64_t SizeAndType, uint64_t Arg1,
+                                      uint64_t Arg2, dfsan_label L0,
+                                      dfsan_label L1, dfsan_label L2) {
+  assert(L0 == 0);
+  uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+  uint64_t CmpSize = (SizeAndType >> 32) / 8;
+  uint64_t Type = (SizeAndType << 32) >> 32;
+  DFSan->DFSanCmpCallback(PC, CmpSize, Type, Arg1, Arg2, L1, L2);
+}
+}  // extern "C"
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@ -161,6 +161,7 @@ int FuzzerDriver(int argc, char **argv, UserCallback Callback) {
  Options.UseCounters = Flags.use_counters;
  Options.UseFullCoverageSet = Flags.use_full_coverage_set;
  Options.UseCoveragePairs = Flags.use_coverage_pairs;
+  Options.UseDFSan = Flags.dfsan;
  Options.PreferSmallDuringInitialShuffle =
      Flags.prefer_small_during_initial_shuffle;
  if (Flags.runs >= 0)
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@ -44,3 +44,5 @@ FUZZER_FLAG(int, jobs, 0, "Number of jobs to run. If jobs >= 1 we spawn"
                          " with stdout/stderr redirected to fuzz-JOB.log.")
 FUZZER_FLAG(int, workers, 0,
            "Number of simultaneous worker processes to run the jobs.")
+FUZZER_FLAG(int, dfsan, 1, "Use DFSan for taint-guided mutations. No-op unless "
+                           "the DFSan instrumentation was compiled in.")
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@ -51,6 +51,7 @@ class Fuzzer {
    bool UseCounters = false;
    bool UseFullCoverageSet  = false;
    bool UseCoveragePairs = false;
+    bool UseDFSan = false;
    int PreferSmallDuringInitialShuffle = -1;
    size_t MaxNumberOfRuns = ULONG_MAX;
    std::string OutputCorpus;
@ -58,10 +59,12 @@ class Fuzzer {
  Fuzzer(UserCallback Callback, FuzzingOptions Options)
      : Callback(Callback), Options(Options) {
    SetDeathCallback();
+    InitializeDFSan();
  }
  void AddToCorpus(const Unit &U) { Corpus.push_back(U); }
  size_t Loop(size_t NumIterations);
  void ShuffleAndMinimize();
+  void InitializeDFSan();
  size_t CorpusSize() const { return Corpus.size(); }
  void ReadDir(const std::string &Path) {
    ReadDirToVectorOfUnits(Path.c_str(), &Corpus);
@ -86,6 +89,7 @@ class Fuzzer {
  size_t RunOneMaximizeCoveragePairs(const Unit &U);
  void WriteToOutputCorpus(const Unit &U);
  static void WriteToCrash(const Unit &U, const char *Prefix);
+  bool MutateWithDFSan(Unit *U);

  void SetDeathCallback();
  static void DeathCallback();
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@ -192,6 +192,7 @@ size_t Fuzzer::MutateAndTestOne(Unit *U) {
  for (int i = 0; i < Options.MutateDepth; i++) {
    if (TotalNumberOfRuns >= Options.MaxNumberOfRuns)
      return NewUnits;
+    MutateWithDFSan(U);
    Mutate(U, Options.MaxLen);
    size_t NewCoverage = RunOne(*U);
    if (NewCoverage) {
--- a/lib/Fuzzer/dfsan_fuzzer_abi.list
+++ b/lib/Fuzzer/dfsan_fuzzer_abi.list
@ -0,0 +1,12 @@
+# Replaces __sanitizer_cov_trace_cmp with __dfsw___sanitizer_cov_trace_cmp
+fun:__sanitizer_cov_trace_cmp=custom
+fun:__sanitizer_cov_trace_cmp=uninstrumented
+
+# Ignores coverage callbacks.
+fun:__sanitizer_cov=uninstrumented
+fun:__sanitizer_cov=discard
+fun:__sanitizer_cov_module_init=uninstrumented
+fun:__sanitizer_cov_module_init=discard
+
+# Don't add extra parameters to the Fuzzer callback.
+fun:TestOneInput=uninstrumented
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@ -2,7 +2,7 @@
 # basic blocks and we'll fail to discover the targets.
 # Also enable the coverage instrumentation back (it is disabled
 # for the Fuzzer lib)
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O0 -fsanitize-coverage=4")
+set(CMAKE_CXX_FLAGS_RELEASE "${LIBFUZZER_FLAGS_BASE} -O0 -fsanitize-coverage=4")

 set(Tests
  CounterTest
@ -14,11 +14,14 @@ set(Tests
  TimeoutTest
  )

+set(DFSanTests
+  DFSanSimpleCmpTest
+  )
+
 set(TestBinaries)

 foreach(Test ${Tests})
  add_executable(LLVMFuzzer-${Test}
-    EXCLUDE_FROM_ALL
    ${Test}.cpp
    )
  target_link_libraries(LLVMFuzzer-${Test}
@ -52,6 +55,13 @@ target_link_libraries(LLVMFuzzer-Unittest

 set(TestBinaries ${TestBinaries} LLVMFuzzer-Unittest)

+add_subdirectory(dfsan)
+
+foreach(Test ${DFSanTests})
+  set(TestBinaries ${TestBinaries} LLVMFuzzer-${Test})
+endforeach()
+
+
 set_target_properties(${TestBinaries}
  PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
  )
--- a/lib/Fuzzer/test/dfsan/CMakeLists.txt
+++ b/lib/Fuzzer/test/dfsan/CMakeLists.txt
@ -0,0 +1,17 @@
+# These tests depend on both coverage and dfsan instrumentation.
+
+set(DFSAN_FUZZER_ABI_LIST "${CMAKE_CURRENT_SOURCE_DIR}/../../dfsan_fuzzer_abi.list")
+
+set(CMAKE_CXX_FLAGS_RELEASE
+  "${LIBFUZZER_FLAGS_BASE} -O0 -fno-sanitize=all -fsanitize=dataflow -mllvm -sanitizer-coverage-experimental-trace-compares=1 -fsanitize-blacklist=${DFSAN_FUZZER_ABI_LIST}")
+
+foreach(Test ${DFSanTests})
+  set_source_files_properties(${Test}.cpp PROPERTIES OBJECT_DEPENDS ${DFSAN_FUZZER_ABI_LIST})
+  add_executable(LLVMFuzzer-${Test}
+    ${Test}.cpp
+    )
+  target_link_libraries(LLVMFuzzer-${Test}
+    LLVMFuzzer
+    )
+endforeach()
+
--- a/lib/Fuzzer/test/dfsan/DFSanSimpleCmpTest.cpp
+++ b/lib/Fuzzer/test/dfsan/DFSanSimpleCmpTest.cpp
@ -0,0 +1,30 @@
+// Simple test for a fuzzer. The fuzzer must find several narrow ranges.
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <cstdio>
+
+extern "C" void TestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size < 14) return;
+  uint64_t x = 0;
+  int64_t  y = 0;
+  int z = 0;
+  unsigned short a = 0;
+  memcpy(&x, Data, 8);
+  memcpy(&y, Data + Size - 8, 8);
+  memcpy(&z, Data + Size / 2, sizeof(z));
+  memcpy(&a, Data + Size / 2 + 4, sizeof(a));
+
+  if (x > 1234567890 &&
+      x < 1234567895 &&
+      y >= 987654321 &&
+      y <= 987654325 &&
+      z < -10000 &&
+      z >= -10005 &&
+      z != -10003 &&
+      a == 4242) {
+    fprintf(stderr, "Found the target: size %zd (%zd, %zd, %d, %d), exiting.\n",
+            Size, x, y, z, a);
+    exit(1);
+  }
+}
--- a/lib/Fuzzer/test/fuzzer.test
+++ b/lib/Fuzzer/test/fuzzer.test
@ -20,3 +20,6 @@ FourIndependentBranchesTest: BINGO

 RUN: not ./LLVMFuzzer-CounterTest -use_counters=1 -max_len=6 -seed=1 -timeout=15 2>&1 | FileCheck %s --check-prefix=CounterTest
 CounterTest: BINGO
+
+RUN: not ./LLVMFuzzer-DFSanSimpleCmpTest -seed=1 -timeout=15 2>&1 | FileCheck %s --check-prefix=DFSanSimpleCmpTest
+DFSanSimpleCmpTest: Found the target: