1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-20 03:23:01 +02:00
llvm-mirror/lib/Support/JSON.cpp
Sam McCall def984827a [Support] Add JSON streaming output API, faster where the heavy value types aren't needed.
Summary:
There's still a little bit of constant factor that could be trimmed (e.g.
more overloads to avoid round-tripping primitives through json::Value).
But this solves the memory scaling problem, and greatly improves the performance
constant factor, and the API should leave room for optimization if needed.

Adapt TimeProfiler to use it, eliminating almost all the performance regression
from r358476.

Performance test on my machine:
perf stat -r 5 ~/llvmbuild-opt/bin/clang++ -w -S -ftime-trace -mllvm -time-trace-granularity=0 spirit.cpp

Handcrafted JSON (HEAD=r358532 with r358476 reverted): 2480ms
json::Value (HEAD): 2757ms (+11%)
After this patch: 2520 ms (+1.6%)

Reviewers: anton-afanasyev, lebedev.ri

Subscribers: kristina, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D60804

llvm-svn: 359186
2019-04-25 12:51:42 +00:00

719 lines
19 KiB
C++

//=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
#include <cerrno>
namespace llvm {
namespace json {
// Returns a reference to the value stored under K. The lookup goes through
// try_emplace(K, nullptr), so the entry is created from nullptr when needed.
Value &Object::operator[](const ObjectKey &K) {
  auto Inserted = try_emplace(K, nullptr);
  return Inserted.first->getSecond();
}
// Rvalue-key overload: the key is moved into try_emplace(…, nullptr) and a
// reference to the mapped value is returned.
Value &Object::operator[](ObjectKey &&K) {
  auto Inserted = try_emplace(std::move(K), nullptr);
  return Inserted.first->getSecond();
}
// Looks up K and returns a pointer to its value, or nullptr if K is absent.
Value *Object::get(StringRef K) {
  auto Pos = find(K);
  return Pos != end() ? &Pos->second : nullptr;
}
// Const lookup: returns a pointer to the value stored under K, or nullptr if
// K is absent.
const Value *Object::get(StringRef K) const {
  auto Pos = find(K);
  return Pos != end() ? &Pos->second : nullptr;
}
// Returns getAsNull() of the value stored under K, or None if K is absent.
llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
  const Value *Found = get(K);
  if (!Found)
    return llvm::None;
  return Found->getAsNull();
}
// Returns getAsBoolean() of the value stored under K, or None if K is absent.
llvm::Optional<bool> Object::getBoolean(StringRef K) const {
  const Value *Found = get(K);
  if (!Found)
    return llvm::None;
  return Found->getAsBoolean();
}
// Returns getAsNumber() of the value stored under K, or None if K is absent.
llvm::Optional<double> Object::getNumber(StringRef K) const {
  const Value *Found = get(K);
  if (!Found)
    return llvm::None;
  return Found->getAsNumber();
}
// Returns getAsInteger() of the value stored under K, or None if K is absent.
llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
  const Value *Found = get(K);
  if (!Found)
    return llvm::None;
  return Found->getAsInteger();
}
// Returns getAsString() of the value stored under K, or None if K is absent.
llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
  const Value *Found = get(K);
  if (!Found)
    return llvm::None;
  return Found->getAsString();
}
// Returns getAsObject() of the value stored under K, or nullptr if K is
// absent.
const json::Object *Object::getObject(StringRef K) const {
  const Value *Found = get(K);
  if (!Found)
    return nullptr;
  return Found->getAsObject();
}
// Mutable variant: returns getAsObject() of the value stored under K, or
// nullptr if K is absent.
json::Object *Object::getObject(StringRef K) {
  Value *Found = get(K);
  if (!Found)
    return nullptr;
  return Found->getAsObject();
}
// Returns getAsArray() of the value stored under K, or nullptr if K is absent.
const json::Array *Object::getArray(StringRef K) const {
  const Value *Found = get(K);
  if (!Found)
    return nullptr;
  return Found->getAsArray();
}
// Mutable variant: returns getAsArray() of the value stored under K, or
// nullptr if K is absent.
json::Array *Object::getArray(StringRef K) {
  Value *Found = get(K);
  if (!Found)
    return nullptr;
  return Found->getAsArray();
}
// Two objects are equal when they have the same number of entries and every
// key of LHS is present in RHS with an equal value.
bool operator==(const Object &LHS, const Object &RHS) {
  if (LHS.size() != RHS.size())
    return false;
  for (const auto &Entry : LHS) {
    auto Match = RHS.find(Entry.first);
    if (Match == RHS.end())
      return false;
    if (Entry.second != Match->second)
      return false;
  }
  return true;
}
// Builds the array from a braced list. Storage is reserved up front; each
// slot is first created as null and then filled via Value::moveFrom from the
// corresponding list element.
Array::Array(std::initializer_list<Value> Elements) {
  V.reserve(Elements.size());
  for (const Value &Element : Elements) {
    emplace_back(nullptr);
    Value &Slot = back();
    Slot.moveFrom(std::move(Element));
  }
}
// Constructs a Value from a braced list: the elements are first gathered into
// a json::Array, which is then passed to the delegated Value constructor.
Value::Value(std::initializer_list<Value> Elements)
: Value(json::Array(Elements)) {}
// Makes this value a copy of M: takes over M's type tag, then copy-constructs
// the payload in place (or copies the raw buffer for the scalar types).
void Value::copyFrom(const Value &M) {
  Type = M.Type;
  switch (Type) {
  case T_Array:
    create<json::Array>(M.as<json::Array>());
    return;
  case T_Object:
    create<json::Object>(M.as<json::Object>());
    return;
  case T_String:
    create<std::string>(M.as<std::string>());
    return;
  case T_StringRef:
    create<StringRef>(M.as<StringRef>());
    return;
  case T_Null:
  case T_Boolean:
  case T_Double:
  case T_Integer:
    // Scalars are copied as raw bytes of the union buffer.
    memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
    return;
  }
}
// Makes this value take over M's contents. Owning payloads (string, object,
// array) are move-constructed and M is then retagged as T_Null; StringRef and
// the scalar types are simply copied and M keeps its tag.
void Value::moveFrom(const Value &&M) {
  Type = M.Type;
  switch (Type) {
  case T_Array:
    create<json::Array>(std::move(M.as<json::Array>()));
    M.Type = T_Null;
    return;
  case T_Object:
    create<json::Object>(std::move(M.as<json::Object>()));
    M.Type = T_Null;
    return;
  case T_String:
    create<std::string>(std::move(M.as<std::string>()));
    M.Type = T_Null;
    return;
  case T_StringRef:
    create<StringRef>(M.as<StringRef>());
    return;
  case T_Null:
  case T_Boolean:
  case T_Double:
  case T_Integer:
    // Scalars are copied as raw bytes of the union buffer.
    memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
    return;
  }
}
// Runs the destructor of whatever payload the current type tag says is
// stored. Scalar types need no cleanup.
void Value::destroy() {
  switch (Type) {
  case T_Array:
    as<json::Array>().~Array();
    return;
  case T_Object:
    as<json::Object>().~Object();
    return;
  case T_String:
    as<std::string>().~basic_string();
    return;
  case T_StringRef:
    as<StringRef>().~StringRef();
    return;
  case T_Null:
  case T_Boolean:
  case T_Double:
  case T_Integer:
    return;
  }
}
// Values are equal when they are of the same kind and their payloads compare
// equal.
bool operator==(const Value &L, const Value &R) {
  const auto LK = L.kind();
  if (LK != R.kind())
    return false;
  switch (LK) {
  case Value::Object:
    return *L.getAsObject() == *R.getAsObject();
  case Value::Array:
    return *L.getAsArray() == *R.getAsArray();
  case Value::String:
    return *L.getAsString() == *R.getAsString();
  case Value::Number: {
    // Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=323
    // The same integer must convert to the same double, per the standard.
    // However we see 64-vs-80-bit precision comparisons with gcc-7 -O3 -m32.
    // So we avoid floating point promotion for exact comparisons.
    const bool EitherIsInteger =
        L.Type == Value::T_Integer || R.Type == Value::T_Integer;
    if (EitherIsInteger)
      return L.getAsInteger() == R.getAsInteger();
    return *L.getAsNumber() == *R.getAsNumber();
  }
  case Value::Boolean:
    return *L.getAsBoolean() == *R.getAsBoolean();
  case Value::Null:
    return *L.getAsNull() == *R.getAsNull();
  }
  llvm_unreachable("Unknown value kind");
}
namespace {
// Simple recursive-descent JSON parser.
// Holds a cursor (P) into the [Start, End) input range and, after a failure,
// the error recorded by parseError().
class Parser {
public:
  Parser(StringRef JSON)
      : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}

  // Validates the whole input as UTF-8. On failure the cursor is moved to the
  // offending offset so that the error carries a useful position.
  bool checkUTF8() {
    size_t ErrOffset;
    if (!isUTF8(StringRef(Start, End - Start), &ErrOffset)) {
      P = Start + ErrOffset; // For line/column calculation.
      return parseError("Invalid UTF-8 sequence");
    }
    return true;
  }

  bool parseValue(Value &Out);

  // Succeeds only if nothing but whitespace remains in the input.
  bool assertEnd() {
    eatWhitespace();
    return P == End || parseError("Text after end of document");
  }

  // Hands the recorded error to the caller; an error must have been set.
  Error takeError() {
    assert(Err);
    return std::move(*Err);
  }

private:
  // Advances the cursor past spaces, carriage returns, newlines and tabs.
  void eatWhitespace() {
    for (; P != End; ++P) {
      const char Ch = *P;
      if (Ch != ' ' && Ch != '\r' && Ch != '\n' && Ch != '\t')
        break;
    }
  }

  // On invalid syntax, parseX() functions return false and set Err.
  bool parseNumber(char First, Value &Out);
  bool parseString(std::string &Out);
  bool parseUnicode(std::string &Out);
  bool parseError(const char *Msg); // always returns false

  // Consumes and returns the next character, or 0 at end of input.
  char next() {
    if (P == End)
      return 0;
    return *P++;
  }

  // Returns the next character without consuming it, or 0 at end of input.
  char peek() {
    if (P == End)
      return 0;
    return *P;
  }

  // True for any character that may appear inside a JSON number token.
  static bool isNumber(char C) {
    switch (C) {
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
    case 'e':
    case 'E':
    case '+':
    case '-':
    case '.':
      return true;
    default:
      return false;
    }
  }

  Optional<Error> Err;
  const char *Start, *P, *End;
};
// Parses a single JSON value at the cursor into Out, skipping leading
// whitespace first. Returns true on success. On failure it returns false and
// an error has been recorded via parseError(); Out may then hold a partially
// built value.
bool Parser::parseValue(Value &Out) {
eatWhitespace();
if (P == End)
return parseError("Unexpected EOF");
// The first character is consumed here and selects the kind of value.
switch (char C = next()) {
// Bare null/true/false are easy - first char identifies them.
// Note that Out is assigned before the remaining letters are verified.
case 'n':
Out = nullptr;
return (next() == 'u' && next() == 'l' && next() == 'l') ||
parseError("Invalid JSON value (null?)");
case 't':
Out = true;
return (next() == 'r' && next() == 'u' && next() == 'e') ||
parseError("Invalid JSON value (true?)");
case 'f':
Out = false;
return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
parseError("Invalid JSON value (false?)");
// String: the opening quote is already consumed; parseString reads the rest.
case '"': {
std::string S;
if (parseString(S)) {
Out = std::move(S);
return true;
}
return false;
}
// Array: Out becomes an empty array and elements are parsed in place.
case '[': {
Out = Array{};
Array &A = *Out.getAsArray();
eatWhitespace();
// Empty array: consume the closing bracket and finish.
if (peek() == ']') {
++P;
return true;
}
for (;;) {
// Append a null placeholder, then parse the element directly into it.
A.emplace_back(nullptr);
if (!parseValue(A.back()))
return false;
eatWhitespace();
switch (next()) {
case ',':
eatWhitespace();
continue;
case ']':
return true;
default:
return parseError("Expected , or ] after array element");
}
}
}
// Object: Out becomes an empty object and properties are parsed in place.
case '{': {
Out = Object{};
Object &O = *Out.getAsObject();
eatWhitespace();
// Empty object: consume the closing brace and finish.
if (peek() == '}') {
++P;
return true;
}
for (;;) {
// Each property is: "key" ':' value, with optional whitespace between.
if (next() != '"')
return parseError("Expected object key");
std::string K;
if (!parseString(K))
return false;
eatWhitespace();
if (next() != ':')
return parseError("Expected : after object key");
eatWhitespace();
// The value is parsed straight into the slot returned by O[K].
if (!parseValue(O[std::move(K)]))
return false;
eatWhitespace();
switch (next()) {
case ',':
eatWhitespace();
continue;
case '}':
return true;
default:
return parseError("Expected , or } after object property");
}
}
}
// Anything else must start a number, otherwise it is not a JSON value.
default:
if (isNumber(C))
return parseNumber(C, Out);
return parseError("Invalid JSON value");
}
}
// Parses a number whose first character (First) has already been consumed.
// All following number characters are collected, then the token is converted:
// to int64_t if it is entirely an integer that fits, otherwise to double.
// Returns false (with an error recorded) if the token is not a valid number.
bool Parser::parseNumber(char First, Value &Out) {
  // Read the number into a string. (Must be null-terminated for strto*).
  SmallString<24> S;
  S.push_back(First);
  while (isNumber(peek()))
    S.push_back(next());
  char *NumEnd;
  // Try first to parse as integer, and if so preserve full 64 bits.
  // strtoll reports overflow only through errno (it returns a clamped value
  // and still consumes all digits), so ERANGE must be checked explicitly;
  // otherwise out-of-range literals would silently become INT64_MAX/MIN
  // instead of falling back to double.
  // strtoll returns long long >= 64 bits, so check it's in range too.
  errno = 0;
  auto I = std::strtoll(S.c_str(), &NumEnd, 10);
  if (NumEnd == S.end() && errno != ERANGE &&
      I >= std::numeric_limits<int64_t>::min() &&
      I <= std::numeric_limits<int64_t>::max()) {
    Out = int64_t(I);
    return true;
  }
  // If it's not an integer (or does not fit in one), parse it as a double.
  Out = std::strtod(S.c_str(), &NumEnd);
  return NumEnd == S.end() || parseError("Invalid JSON value (number?)");
}
// Parses the body of a JSON string and appends the decoded text to Out.
// Consumes input up to and including the closing quote. Returns false, with
// an error recorded via parseError(), on an unterminated string, a raw
// control character, or a bad escape sequence.
bool Parser::parseString(std::string &Out) {
// leading quote was already consumed.
for (char C = next(); C != '"'; C = next()) {
// C was not a closing quote and no input is left, so none can follow.
if (LLVM_UNLIKELY(P == End))
return parseError("Unterminated string");
// (C & 0x1f) == C holds only for the values 0x00-0x1f.
if (LLVM_UNLIKELY((C & 0x1f) == C))
return parseError("Control character in string");
// Ordinary characters are copied through unchanged.
if (LLVM_LIKELY(C != '\\')) {
Out.push_back(C);
continue;
}
// Handle escape sequence.
switch (C = next()) {
// These three escapes stand for the character itself.
case '"':
case '\\':
case '/':
Out.push_back(C);
break;
case 'b':
Out.push_back('\b');
break;
case 'f':
Out.push_back('\f');
break;
case 'n':
Out.push_back('\n');
break;
case 'r':
Out.push_back('\r');
break;
case 't':
Out.push_back('\t');
break;
// \uNNNN escapes are decoded by parseUnicode.
case 'u':
if (!parseUnicode(Out))
return false;
break;
default:
return parseError("Invalid escape sequence");
}
}
return true;
}
// Appends the UTF-8 encoding (1 to 4 bytes) of the code point Rune to Out.
// Rune must be below 0x110000.
static void encodeUtf8(uint32_t Rune, std::string &Out) {
  if (Rune >= 0x110000)
    llvm_unreachable("Invalid codepoint");

  // Emits one continuation byte carrying the 6 bits of Rune found at Shift.
  auto Continuation = [&](unsigned Shift) {
    Out.push_back(static_cast<uint8_t>(0x80 | ((Rune >> Shift) & 0x3F)));
  };

  if (Rune < 0x80) {
    Out.push_back(Rune & 0x7F);
    return;
  }
  if (Rune < 0x800) {
    Out.push_back(static_cast<uint8_t>(0xC0 | (Rune >> 6)));
    Continuation(0);
    return;
  }
  if (Rune < 0x10000) {
    Out.push_back(static_cast<uint8_t>(0xE0 | (Rune >> 12)));
    Continuation(6);
    Continuation(0);
    return;
  }
  Out.push_back(static_cast<uint8_t>(0xF0 | (Rune >> 18)));
  Continuation(12);
  Continuation(6);
  Continuation(0);
}
// Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
// May parse several sequential escapes to ensure proper surrogate handling.
// We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
// These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
//
// The decoded text is appended to Out as UTF-8. Returns false, with an error
// recorded via parseError(), only when an escape is not four hex digits.
bool Parser::parseUnicode(std::string &Out) {
  // Invalid UTF is not a JSON error (RFC 8259§8.2). It gets replaced by U+FFFD.
  auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
  // Decodes 4 hex digits from the stream into Out, returns false on error.
  auto Parse4Hex = [this](uint16_t &Out) -> bool {
    Out = 0;
    char Bytes[] = {next(), next(), next(), next()};
    for (unsigned char C : Bytes) {
      if (!std::isxdigit(C))
        return parseError("Invalid \\u escape sequence");
      Out <<= 4;
      // Letters are folded to upper case by clearing bit 0x20.
      Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
    }
    return true;
  };
  uint16_t First; // UTF-16 code unit from the first \u escape.
  if (!Parse4Hex(First))
    return false;

  // We loop to allow proper surrogate-pair error handling.
  while (true) {
    // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
    if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
      encodeUtf8(First, Out);
      return true;
    }

    // Case 2: it's an (unpaired) trailing surrogate.
    if (LLVM_UNLIKELY(First >= 0xDC00)) {
      Invalid();
      return true;
    }

    // Case 3: it's a leading surrogate. We expect a trailing one next.
    // Case 3a: there's no trailing \u escape. Don't advance in the stream.
    // (Compare the remaining length rather than forming P + 2, which could
    // point beyond one-past-the-end of the buffer.)
    if (LLVM_UNLIKELY(End - P < 2 || *P != '\\' || *(P + 1) != 'u')) {
      Invalid(); // Leading surrogate was unpaired.
      return true;
    }
    P += 2;
    uint16_t Second;
    if (!Parse4Hex(Second))
      return false;
    // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
    if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
      Invalid();      // Leading surrogate was unpaired.
      First = Second; // Second escape still needs to be processed.
      continue;
    }
    // Case 3c: a valid surrogate pair encoding an astral codepoint.
    // The 0x10000 offset must be ADDED: the high ten bits can themselves
    // reach bit 16 (leading surrogates >= 0xD840), so OR-ing would lose the
    // offset and mis-decode every code point from U+20000 upwards.
    const uint32_t High = uint32_t(First) - 0xD800;
    const uint32_t Low = uint32_t(Second) - 0xDC00;
    encodeUtf8(0x10000 + (High << 10) + Low, Out);
    return true;
  }
}
// Records a ParseError for the current cursor position and returns false.
// The line number is 1-based and found by counting line feeds before the
// cursor; the column is the distance from the start of that line; the last
// argument is the byte offset from the start of the input.
bool Parser::parseError(const char *Msg) {
  int Line = 1;
  const char *LineBegin = Start;
  const char *Cur = Start;
  while (Cur < P) {
    const bool IsLineFeed = (*Cur == 0x0A);
    ++Cur;
    if (IsLineFeed) {
      ++Line;
      LineBegin = Cur;
    }
  }
  Err.emplace(
      llvm::make_unique<ParseError>(Msg, Line, P - LineBegin, P - Start));
  return false;
}
} // namespace
// Parses a complete JSON document. The input must be valid UTF-8, contain
// exactly one value, and have nothing but whitespace after it; otherwise the
// error recorded by the parser is returned.
Expected<Value> parse(StringRef JSON) {
  Parser P(JSON);
  Value E = nullptr;
  // The stages short-circuit: the first failing one has recorded the error.
  if (!P.checkUTF8() || !P.parseValue(E) || !P.assertEnd())
    return P.takeError();
  return std::move(E);
}
// Out-of-line definition of ParseError's static ID member, initialised to 0.
// NOTE(review): presumably the per-class tag used by LLVM's Error
// infrastructure to identify the error type - confirm against JSON.h.
char ParseError::ID = 0;
static std::vector<const Object::value_type *> sortedElements(const Object &O) {
std::vector<const Object::value_type *> Elements;
for (const auto &E : O)
Elements.push_back(&E);
llvm::sort(Elements,
[](const Object::value_type *L, const Object::value_type *R) {
return L->first < R->first;
});
return Elements;
}
// Returns true if S is valid UTF-8. Otherwise returns false and, when
// ErrOffset is non-null, stores the offset at which validation stopped.
bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
  // Fast-path for ASCII, which is valid UTF-8.
  if (LLVM_LIKELY(isASCII(S)))
    return true;

  const UTF8 *Begin = reinterpret_cast<const UTF8 *>(S.data());
  const UTF8 *Cursor = Begin;
  const bool Legal = isLegalUTF8String(&Cursor, Begin + S.size());
  if (LLVM_LIKELY(Legal))
    return true;

  if (ErrOffset)
    *ErrOffset = Cursor - Begin;
  return false;
}
// Returns a copy of S re-encoded as UTF-8 by round-tripping it through
// UTF-32: a lenient UTF-8 -> UTF-32 conversion followed by a strict
// UTF-32 -> UTF-8 conversion.
std::string fixUTF8(llvm::StringRef S) {
  // This isn't particularly efficient, but is only for error-recovery.
  std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
  const UTF8 *Src8 = reinterpret_cast<const UTF8 *>(S.data());
  const UTF8 *Src8End = Src8 + S.size();
  UTF32 *Dst32 = Codepoints.data();
  UTF32 *Dst32End = Dst32 + Codepoints.size();
  ConvertUTF8toUTF32(&Src8, Src8End, &Dst32, Dst32End, lenientConversion);
  // Shrink to the number of codepoints actually produced.
  Codepoints.resize(Dst32 - Codepoints.data());

  std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
  const UTF32 *Src32 = Codepoints.data();
  const UTF32 *Src32End = Src32 + Codepoints.size();
  UTF8 *Dst8 = reinterpret_cast<UTF8 *>(&Res[0]);
  UTF8 *Dst8End = Dst8 + Res.size();
  ConvertUTF32toUTF8(&Src32, Src32End, &Dst8, Dst8End, strictConversion);
  // Shrink to the number of bytes actually written.
  Res.resize(reinterpret_cast<char *>(Dst8) - Res.data());
  return Res;
}
// Writes S to OS as a double-quoted JSON string. Quote and backslash are
// backslash-escaped; bytes below 0x20 are written as \t, \n, \r or a
// four-digit lowercase \u escape; all other bytes pass through unchanged.
static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
  OS << '\"';
  for (unsigned char Ch : S) {
    if (Ch < 0x20) {
      OS << '\\';
      // A few characters are common enough to make short escapes worthwhile.
      if (Ch == '\t') {
        OS << 't';
      } else if (Ch == '\n') {
        OS << 'n';
      } else if (Ch == '\r') {
        OS << 'r';
      } else {
        OS << 'u';
        llvm::write_hex(OS, Ch, llvm::HexPrintStyle::Lower, 4);
      }
      continue;
    }
    const bool NeedsBackslash = (Ch == 0x22 || Ch == 0x5C);
    if (NeedsBackslash)
      OS << '\\';
    OS << Ch;
  }
  OS << '\"';
}
// Emits V to the stream. Scalars and strings are written directly after
// valueBegin(); arrays and objects recurse through array()/object(), with
// object properties written in sorted key order.
void llvm::json::OStream::value(const Value &V) {
  switch (V.kind()) {
  case Value::Object:
    object([&] {
      for (const Object::value_type *Entry : sortedElements(*V.getAsObject()))
        attribute(Entry->first, Entry->second);
    });
    return;
  case Value::Array:
    array([&] {
      for (const Value &Element : *V.getAsArray())
        value(Element);
    });
    return;
  case Value::String:
    valueBegin();
    quote(OS, *V.getAsString());
    return;
  case Value::Number:
    valueBegin();
    // Integers are printed exactly; other numbers with max_digits10
    // significant digits.
    if (V.Type != Value::T_Integer) {
      OS << format("%.*g", std::numeric_limits<double>::max_digits10,
                   *V.getAsNumber());
      return;
    }
    OS << *V.getAsInteger();
    return;
  case Value::Boolean:
    valueBegin();
    OS << (*V.getAsBoolean() ? "true" : "false");
    return;
  case Value::Null:
    valueBegin();
    OS << "null";
    return;
  }
}
void llvm::json::OStream::valueBegin() {
assert(Stack.back().Ctx != Object && "Only attributes allowed here");
if (Stack.back().HasValue) {
assert(Stack.back().Ctx != Singleton && "Only one value allowed here");
OS << ',';
}
if (Stack.back().Ctx == Array)
newline();
Stack.back().HasValue = true;
}
// Starts a new indented line. Does nothing when IndentSize is zero.
void llvm::json::OStream::newline() {
  if (!IndentSize)
    return;
  OS.write('\n');
  OS.indent(Indent);
}
// Opens an array: it counts as a value in the enclosing context, then a new
// Array context is pushed and the indentation level is increased.
void llvm::json::OStream::arrayBegin() {
  valueBegin();
  OS << '[';
  Indent += IndentSize;
  Stack.emplace_back();
  Stack.back().Ctx = Array;
}
// Closes the current array: restores the indentation level, puts the closing
// bracket on its own line if the array had elements, and pops the context.
void llvm::json::OStream::arrayEnd() {
  assert(Stack.back().Ctx == Array);
  const bool HadElements = Stack.back().HasValue;
  Indent -= IndentSize;
  if (HadElements)
    newline();
  OS << ']';
  Stack.pop_back();
  assert(!Stack.empty());
}
// Opens an object: it counts as a value in the enclosing context, then a new
// Object context is pushed and the indentation level is increased.
void llvm::json::OStream::objectBegin() {
  valueBegin();
  OS << '{';
  Indent += IndentSize;
  Stack.emplace_back();
  Stack.back().Ctx = Object;
}
// Closes the current object: restores the indentation level, puts the closing
// brace on its own line if the object had properties, and pops the context.
void llvm::json::OStream::objectEnd() {
  assert(Stack.back().Ctx == Object);
  const bool HadProperties = Stack.back().HasValue;
  Indent -= IndentSize;
  if (HadProperties)
    newline();
  OS << '}';
  Stack.pop_back();
  assert(!Stack.empty());
}
// Starts a property inside the current object: writes the separator and new
// line, pushes a Singleton context for the upcoming value, then emits the
// quoted key followed by ':' (and a space when indenting).
void llvm::json::OStream::attributeBegin(llvm::StringRef Key) {
  assert(Stack.back().Ctx == Object);
  if (Stack.back().HasValue)
    OS << ',';
  newline();
  Stack.back().HasValue = true;

  Stack.emplace_back();
  Stack.back().Ctx = Singleton;

  if (LLVM_UNLIKELY(!isUTF8(Key))) {
    // Invalid keys trip the assertion; where assertions are compiled out the
    // key is repaired with fixUTF8 instead.
    assert(false && "Invalid UTF-8 in attribute key");
    quote(OS, fixUTF8(Key));
  } else {
    quote(OS, Key);
  }

  OS.write(':');
  if (IndentSize)
    OS.write(' ');
}
// Ends the property started by attributeBegin(): pops the Singleton context
// that held the property's value, returning to the enclosing Object context.
void llvm::json::OStream::attributeEnd() {
// The top of the stack must be the property's own context...
assert(Stack.back().Ctx == Singleton);
// ...and exactly one value must have been written into it.
assert(Stack.back().HasValue && "Attribute must have a value");
Stack.pop_back();
assert(Stack.back().Ctx == Object);
}
} // namespace json
} // namespace llvm
// Writes E to OS as JSON. Options, when non-empty, must be a decimal integer
// and is used as the indentation size; otherwise the indentation size is 0.
void llvm::format_provider<llvm::json::Value>::format(
    const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
  unsigned IndentAmount = 0;
  if (!Options.empty()) {
    // getAsInteger returning true is treated as the failure case here.
    const bool Failed = Options.getAsInteger(/*Radix=*/10, IndentAmount);
    if (Failed)
      llvm_unreachable("json::Value format options should be an integer");
  }
  json::OStream(OS, IndentAmount).value(E);
}