[Support] Harden JSON against invalid UTF-8.

Parsing invalid UTF-8 input is now a parse error. Creating JSON values from invalid UTF-8 now triggers an assertion, and (in no-assert builds) substitutes the unicode replacement character. Strings retrieved from json::Value are always valid UTF-8. llvm-svn: 336657
2024-11-26 04:32:44 +01:00 · 2018-07-10 11:51:26 +00:00 · 2018-07-10 11:51:26 +00:00 · 312ed03a23
commit 312ed03a23
parent d6c8bf565f
4 changed files with 146 additions and 17 deletions
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@ -88,6 +88,17 @@ inline bool isAlpha(char C) {
 /// lowercase letter as classified by "C" locale.
 inline bool isAlnum(char C) { return isAlpha(C) || isDigit(C); }

+/// Checks whether character \p C is valid ASCII (high bit is zero).
+inline bool isASCII(char C) { return static_cast<unsigned char>(C) <= 127; }
+
+/// Checks whether all characters in S are ASCII.
+inline bool isASCII(llvm::StringRef S) {
+  for (char C : S)
+    if (LLVM_UNLIKELY(!isASCII(C)))
+      return false;
+  return true;
+}
+
 /// Returns the corresponding lowercase character if \p x is uppercase.
 inline char toLower(char x) {
  if (x >= 'A' && x <= 'Z')
--- a/include/llvm/Support/JSON.h
+++ b/include/llvm/Support/JSON.h
@ -54,6 +54,30 @@

 namespace llvm {
 namespace json {
+
+// === String encodings ===
+//
+// JSON strings are character sequences (not byte sequences like std::string).
+// We need to know the encoding, and for simplicity only support UTF-8.
+//
+//   - When parsing, invalid UTF-8 is a syntax error like any other
+//
+//   - When creating Values from strings, callers must ensure they are UTF-8.
+//        with asserts on, invalid UTF-8 will crash the program
+//        with asserts off, we'll substitute the replacement character (U+FFFD)
+//     Callers can use json::isUTF8() and json::fixUTF8() for validation.
+//
+//   - When retrieving strings from Values (e.g. asString()), the result will
+//     always be valid UTF-8.
+
+/// Returns true if \p S is valid UTF-8, which is required for use as JSON.
+/// If not, \p ErrOffset is set to a byte offset near the first error.
+bool isUTF8(llvm::StringRef S, size_t *ErrOffset = nullptr);
+/// Replaces invalid UTF-8 sequences in \p S with the replacement character
+/// (U+FFFD). The returned string is valid UTF-8.
+/// This is much slower than isUTF8, so test that first.
+std::string fixUTF8(llvm::StringRef S);
+
 class Array;
 class ObjectKey;
 class Value;
@ -273,16 +297,26 @@ public:
  Value(json::Object &&Properties) : Type(T_Object) {
    create<json::Object>(std::move(Properties));
  }
-  // Strings: types with value semantics.
-  Value(std::string &&V) : Type(T_String) { create<std::string>(std::move(V)); }
-  Value(const std::string &V) : Type(T_String) { create<std::string>(V); }
-  Value(const llvm::SmallVectorImpl<char> &V) : Type(T_String) {
-    create<std::string>(V.begin(), V.end());
+  // Strings: types with value semantics. Must be valid UTF-8.
+  Value(std::string V) : Type(T_String) {
+    if (LLVM_UNLIKELY(!isUTF8(V))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      V = fixUTF8(std::move(V));
+    }
+    create<std::string>(std::move(V));
  }
+  Value(const llvm::SmallVectorImpl<char> &V)
+      : Value(std::string(V.begin(), V.end())){};
  Value(const llvm::formatv_object_base &V) : Value(V.str()){};
-  // Strings: types with reference semantics.
-  Value(llvm::StringRef V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
-  Value(const char *V) : Type(T_StringRef) { create<llvm::StringRef>(V); }
+  // Strings: types with reference semantics. Must be valid UTF-8.
+  Value(StringRef V) : Type(T_StringRef) {
+    create<llvm::StringRef>(V);
+    if (LLVM_UNLIKELY(!isUTF8(V))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *this = Value(fixUTF8(V));
+    }
+  }
+  Value(const char *V) : Value(StringRef(V)) {}
  Value(std::nullptr_t) : Type(T_Null) {}
  // Boolean (disallow implicit conversions).
  // (The last template parameter is a dummy to keep templates distinct.)
@ -449,13 +483,23 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &, const Value &);
 /// ObjectKey is a used to capture keys in Object. Like Value but:
 ///   - only strings are allowed
 ///   - it's optimized for the string literal case (Owned == nullptr)
+/// Like Value, strings must be UTF-8. See isUTF8 documentation for details.
 class ObjectKey {
 public:
-  ObjectKey(const char *S) : Data(S) {}
-  ObjectKey(llvm::StringRef S) : Data(S) {}
-  ObjectKey(std::string &&V)
-      : Owned(new std::string(std::move(V))), Data(*Owned) {}
-  ObjectKey(const std::string &V) : Owned(new std::string(V)), Data(*Owned) {}
+  ObjectKey(const char *S) : ObjectKey(StringRef(S)) {}
+  ObjectKey(std::string S) : Owned(new std::string(std::move(S))) {
+    if (LLVM_UNLIKELY(!isUTF8(*Owned))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *Owned = fixUTF8(std::move(*Owned));
+    }
+    Data = *Owned;
+  }
+  ObjectKey(llvm::StringRef S) : Data(S) {
+    if (LLVM_UNLIKELY(!isUTF8(Data))) {
+      assert(false && "Invalid UTF-8 in value used as JSON");
+      *this = ObjectKey(fixUTF8(S));
+    }
+  }
  ObjectKey(const llvm::SmallVectorImpl<char> &V)
      : ObjectKey(std::string(V.begin(), V.end())) {}
  ObjectKey(const llvm::formatv_object_base &V) : ObjectKey(V.str()) {}
--- a/lib/Support/JSON.cpp
+++ b/lib/Support/JSON.cpp
@ -8,6 +8,7 @@
 //===---------------------------------------------------------------------===//

 #include "llvm/Support/JSON.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Format.h"
 #include <cctype>

@ -199,6 +200,14 @@ public:
  Parser(StringRef JSON)
      : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}

+  bool checkUTF8() {
+    size_t ErrOffset;
+    if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
+      return true;
+    P = Start + ErrOffset; // For line/column calculation.
+    return parseError("Invalid UTF-8 sequence");
+  }
+
  bool parseValue(Value &Out);

  bool assertEnd() {
@ -458,7 +467,7 @@ bool Parser::parseUnicode(std::string &Out) {

    // Case 3: it's a leading surrogate. We expect a trailing one next.
    // Case 3a: there's no trailing \u escape. Don't advance in the stream.
-    if (!LLVM_LIKELY(P + 2 <= End && *P == '\\' && *(P + 1) == 'u')) {
+    if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
      Invalid(); // Leading surrogate was unpaired.
      return true;
    }
@ -496,9 +505,10 @@ bool Parser::parseError(const char *Msg) {
 Expected<Value> parse(StringRef JSON) {
  Parser P(JSON);
  Value E = nullptr;
-  if (P.parseValue(E))
-    if (P.assertEnd())
-      return std::move(E);
+  if (P.checkUTF8())
+    if (P.parseValue(E))
+      if (P.assertEnd())
+        return std::move(E);
  return P.takeError();
 }
 char ParseError::ID = 0;
@ -514,6 +524,37 @@ static std::vector<const Object::value_type *> sortedElements(const Object &O) {
  return Elements;
 }

+bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
+  // Fast-path for ASCII, which is valid UTF-8.
+  if (LLVM_LIKELY(isASCII(S)))
+    return true;
+
+  const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
+  if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
+    return true;
+
+  if (ErrOffset)
+    *ErrOffset = Rest - Data;
+  return false;
+}
+
+std::string fixUTF8(llvm::StringRef S) {
+  // This isn't particularly efficient, but is only for error-recovery.
+  std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
+  const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
+  UTF32 *Out32 = Codepoints.data();
+  ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
+                     lenientConversion);
+  Codepoints.resize(Out32 - Codepoints.data());
+  std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
+  const UTF32 *In32 = Codepoints.data();
+  UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
+  ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
+                     strictConversion);
+  Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
+  return Res;
+}
+
 } // namespace json
 } // namespace llvm

--- a/unittests/Support/JSONTest.cpp
+++ b/unittests/Support/JSONTest.cpp
@ -27,6 +27,14 @@ TEST(JSONTest, Types) {
  EXPECT_EQ(R"("foo")", s("foo"));
  EXPECT_EQ("[1,2,3]", s({1, 2, 3}));
  EXPECT_EQ(R"({"x":10,"y":20})", s(Object{{"x", 10}, {"y", 20}}));
+
+#ifdef NDEBUG
+  EXPECT_EQ(R"("<EFBFBD><EFBFBD>")", s("\xC0\x80"));
+  EXPECT_EQ(R"({"<EFBFBD><EFBFBD>":0})", s(Object{{"\xC0\x80", 0}}));
+#else
+  EXPECT_DEATH(s("\xC0\x80"), "Invalid UTF-8");
+  EXPECT_DEATH(s(Object{{"\xC0\x80", 0}}), "Invalid UTF-8");
+#endif
 }

 TEST(JSONTest, Constructors) {
@ -181,6 +189,31 @@ TEST(JSONTest, ParseErrors) {
  "valid": 1,
  invalid: 2
 })");
+  ExpectErr("Invalid UTF-8 sequence", "\"\xC0\x80\""); // overlong-encoded NUL
+}
+
+// Direct tests of isUTF8 and fixUTF8. Internal uses are also tested elsewhere.
+TEST(JSONTest, UTF8) {
+  for (const char *Valid : {
+           "this is ASCII text",
+           "thïs tëxt häs BMP chäräctërs",
+           "𐌶𐌰L𐌾𐍈 C𐍈𐌼𐌴𐍃",
+       }) {
+    EXPECT_TRUE(isUTF8(Valid)) << Valid;
+    EXPECT_EQ(fixUTF8(Valid), Valid);
+  }
+  for (auto Invalid : std::vector<std::pair<const char *, const char *>>{
+           {"lone trailing \x81\x82 bytes", "lone trailing <20><> bytes"},
+           {"missing trailing \xD0 bytes", "missing trailing <20> bytes"},
+           {"truncated character \xD0", "truncated character <20>"},
+           {"not \xC1\x80 the \xE0\x9f\xBF shortest \xF0\x83\x83\x83 encoding",
+            "not <20><> the <20><><EFBFBD> shortest <20><><EFBFBD><EFBFBD> encoding"},
+           {"too \xF9\x80\x80\x80\x80 long", "too <20><><EFBFBD><EFBFBD><EFBFBD> long"},
+           {"surrogate \xED\xA0\x80 invalid \xF4\x90\x80\x80",
+            "surrogate <20><><EFBFBD> invalid <20><><EFBFBD><EFBFBD>"}}) {
+    EXPECT_FALSE(isUTF8(Invalid.first)) << Invalid.first;
+    EXPECT_EQ(fixUTF8(Invalid.first), Invalid.second);
+  }
 }

 TEST(JSONTest, Inspection) {