mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[YAML] Escape non-printable multibyte UTF8 in Output::scalarString.
The existing YAML Output::scalarString code path includes a partial and incorrect implementation of YAML escaping logic. In particular, the logic put in place in rL321283 escapes non-printable bytes only if they are not part of a multibyte UTF8 sequence; implicitly this means that all multibyte UTF8 sequences -- printable and non-printable -- are passed through verbatim. The simplest solution to this is to direct the Output::scalarString method to use the standalone yaml::escape function, and this _almost_ works, except that the existing code in that function _over_ escapes: any multibyte UTF8 sequence is escaped, even printable ones. While this is permitted for YAML, it is also more aggressive (and harder to read for non-English locales) than necessary, and the entire point of rL321283 was to back off such aggressive over-escaping. So in this change, I have both redirected Output::scalarString to use yaml::escape _and_ modified yaml::escape to optionally restrict its escaping to non-printables. This preserves behaviour of any existing clients while giving them a path to more moderate escaping should they desire. Reviewers: JDevlieghere, thegameg, MatzeB, vladimir.plyashkun Reviewed By: thegameg Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D44863 llvm-svn: 328661
This commit is contained in:
parent
e26e4e4ebc
commit
24302d51b0
@ -73,8 +73,11 @@ bool dumpTokens(StringRef Input, raw_ostream &);
|
||||
/// \returns true if there was an error, false otherwise.
|
||||
bool scanTokens(StringRef Input);
|
||||
|
||||
/// \brief Escape \a Input for a double quoted scalar.
|
||||
std::string escape(StringRef Input);
|
||||
/// \brief Escape \a Input for a double quoted scalar. If \p EscapePrintable
|
||||
/// is true, all multibyte UTF8 sequences will be escaped; if it is false,
|
||||
/// those UTF8 sequences encoding printable unicode scalars will not be
|
||||
/// escaped, but emitted verbatim.
|
||||
std::string escape(StringRef Input, bool EscapePrintable = true);
|
||||
|
||||
/// \brief This class represents a YAML stream potentially containing multiple
|
||||
/// documents.
|
||||
|
@ -26,6 +26,7 @@
|
||||
#include "llvm/Support/MemoryBuffer.h"
|
||||
#include "llvm/Support/SMLoc.h"
|
||||
#include "llvm/Support/SourceMgr.h"
|
||||
#include "llvm/Support/Unicode.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
@ -687,7 +688,7 @@ bool yaml::scanTokens(StringRef Input) {
|
||||
return true;
|
||||
}
|
||||
|
||||
std::string yaml::escape(StringRef Input) {
|
||||
std::string yaml::escape(StringRef Input, bool EscapePrintable) {
|
||||
std::string EscapedInput;
|
||||
for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
|
||||
if (*i == '\\')
|
||||
@ -734,6 +735,9 @@ std::string yaml::escape(StringRef Input) {
|
||||
EscapedInput += "\\L";
|
||||
else if (UnicodeScalarValue.first == 0x2029)
|
||||
EscapedInput += "\\P";
|
||||
else if (!EscapePrintable &&
|
||||
sys::unicode::isPrintable(UnicodeScalarValue.first))
|
||||
EscapedInput += StringRef(i, UnicodeScalarValue.second);
|
||||
else {
|
||||
std::string HexStr = utohexstr(UnicodeScalarValue.first);
|
||||
if (HexStr.size() <= 2)
|
||||
|
@ -638,39 +638,22 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
|
||||
const char *Base = S.data();
|
||||
|
||||
const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
|
||||
const char QuoteChar = MustQuote == QuotingType::Single ? '\'' : '"';
|
||||
|
||||
output(Quote); // Starting quote.
|
||||
|
||||
// When using single-quoted strings, any single quote ' must be doubled to be
|
||||
// escaped.
|
||||
// When using double-quoted strings, print \x + hex for non-printable ASCII
|
||||
// characters, and escape double quotes.
|
||||
// When using double-quoted strings (and only in that case), non-printable characters may be
|
||||
// present, and will be escaped using a variety of unicode-scalar and special short-form
|
||||
// escapes. This is handled in yaml::escape.
|
||||
if (MustQuote == QuotingType::Double) {
|
||||
output(yaml::escape(Base, /* EscapePrintable= */ false));
|
||||
this->outputUpToEndOfLine(Quote);
|
||||
return;
|
||||
}
|
||||
|
||||
// When using single-quoted strings, any single quote ' must be doubled to be escaped.
|
||||
while (j < End) {
|
||||
if (S[j] == QuoteChar) { // Escape quotes.
|
||||
output(StringRef(&Base[i], j - i)); // "flush".
|
||||
if (MustQuote == QuotingType::Double) { // Print it as \"
|
||||
output(StringLiteral("\\"));
|
||||
output(StringRef(Quote, 1));
|
||||
} else { // Single
|
||||
output(StringLiteral("''")); // Print it as ''
|
||||
}
|
||||
i = j + 1;
|
||||
} else if (MustQuote == QuotingType::Double &&
|
||||
!sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) {
|
||||
// If we're double quoting non-printable characters, we prefer printing
|
||||
// them as "\x" + their hex representation. Note that special casing is
|
||||
// needed for UTF-8, where a byte may be part of a UTF-8 sequence and
|
||||
// appear as non-printable, in which case we want to print the correct
|
||||
// unicode character and not its hex representation.
|
||||
output(StringRef(&Base[i], j - i)); // "flush"
|
||||
output(StringLiteral("\\x"));
|
||||
|
||||
// Output the byte 0x0F as \x0f.
|
||||
auto FormattedHex = format_hex_no_prefix(S[j], 2);
|
||||
Out << FormattedHex;
|
||||
Column += 4; // one for the '\', one for the 'x', and two for the hex
|
||||
|
||||
if (S[j] == '\'') { // Escape quotes.
|
||||
output(StringRef(&Base[i], j - i)); // "flush".
|
||||
output(StringLiteral("''")); // Print it as ''
|
||||
i = j + 1;
|
||||
}
|
||||
++j;
|
||||
|
@ -2464,7 +2464,10 @@ static void TestEscaped(llvm::StringRef Input, llvm::StringRef Expected) {
|
||||
yamlize(xout, Input, true, Ctx);
|
||||
|
||||
ostr.flush();
|
||||
EXPECT_EQ(Expected, out);
|
||||
|
||||
// Make a separate StringRef so we get nice byte-by-byte output.
|
||||
llvm::StringRef Got(out);
|
||||
EXPECT_EQ(Expected, Got);
|
||||
}
|
||||
|
||||
TEST(YAMLIO, TestEscaped) {
|
||||
@ -2485,4 +2488,17 @@ TEST(YAMLIO, TestEscaped) {
|
||||
// UTF8 with single quote inside double quote
|
||||
TestEscaped("parameter 'параметр' is unused",
|
||||
"\"parameter 'параметр' is unused\"");
|
||||
|
||||
// String with embedded non-printable multibyte UTF-8 sequence (U+200B
|
||||
// zero-width space). The thing to test here is that we emit a
|
||||
// unicode-scalar level escape like \uNNNN (at the YAML level), and don't
|
||||
// just pass the UTF-8 byte sequence through as with quoted printables.
|
||||
TestEscaped("foo\u200Bbar", "\"foo\\u200Bbar\"");
|
||||
{
|
||||
const unsigned char foobar[10] = {'f', 'o', 'o',
|
||||
0xE2, 0x80, 0x8B, // UTF-8 of U+200B
|
||||
'b', 'a', 'r',
|
||||
0x0};
|
||||
TestEscaped((char const *)foobar, "\"foo\\u200Bbar\"");
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user