1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[YAML] Escape non-printable multibyte UTF8 in Output::scalarString.

The existing YAML Output::scalarString code path includes a partial and
incorrect implementation of YAML escaping logic. In particular, the logic put
in place in rL321283 escapes non-printable bytes only if they are not part of a
multibyte UTF8 sequence; implicitly this means that all multibyte UTF8
sequences -- printable and non -- are passed through verbatim.

The simplest solution to this is to direct the Output::scalarString method to
use the standalone yaml::escape function, and this _almost_ works, except that
the existing code in that function _over_ escapes: any multibyte UTF8 sequence
is escaped, even printable ones. While this is permitted for YAML, it is also
more aggressive (and hard to read for non-English locales) than necessary,
and the entire point of rL321283 was to back off such aggressive over-escaping.

So in this change, I have both redirected Output::scalarString to use
yaml::escape _and_ modified yaml::escape to optionally restrict its escaping to
non-printables. This preserves behaviour of any existing clients while giving
them a path to more moderate escaping should they desire.

Reviewers: JDevlieghere, thegameg, MatzeB, vladimir.plyashkun

Reviewed By: thegameg

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D44863

llvm-svn: 328661
This commit is contained in:
Graydon Hoare 2018-03-27 19:52:45 +00:00
parent e26e4e4ebc
commit 24302d51b0
4 changed files with 40 additions and 34 deletions

View File

@ -73,8 +73,11 @@ bool dumpTokens(StringRef Input, raw_ostream &);
/// \returns true if there was an error, false otherwise.
bool scanTokens(StringRef Input);
/// \brief Escape \a Input for a double quoted scalar.
std::string escape(StringRef Input);
/// \brief Escape \a Input for a double quoted scalar; if \p EscapePrintable
/// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
/// false, those UTF8 sequences encoding printable unicode scalars will not be
/// escaped, but emitted verbatim.
std::string escape(StringRef Input, bool EscapePrintable = true);
/// \brief This class represents a YAML stream potentially containing multiple
/// documents.

View File

@ -26,6 +26,7 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
@ -687,7 +688,7 @@ bool yaml::scanTokens(StringRef Input) {
return true;
}
std::string yaml::escape(StringRef Input) {
std::string yaml::escape(StringRef Input, bool EscapePrintable) {
std::string EscapedInput;
for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
if (*i == '\\')
@ -734,6 +735,9 @@ std::string yaml::escape(StringRef Input) {
EscapedInput += "\\L";
else if (UnicodeScalarValue.first == 0x2029)
EscapedInput += "\\P";
else if (!EscapePrintable &&
sys::unicode::isPrintable(UnicodeScalarValue.first))
EscapedInput += StringRef(i, UnicodeScalarValue.second);
else {
std::string HexStr = utohexstr(UnicodeScalarValue.first);
if (HexStr.size() <= 2)

View File

@ -638,39 +638,22 @@ void Output::scalarString(StringRef &S, QuotingType MustQuote) {
const char *Base = S.data();
const char *const Quote = MustQuote == QuotingType::Single ? "'" : "\"";
const char QuoteChar = MustQuote == QuotingType::Single ? '\'' : '"';
output(Quote); // Starting quote.
// When using single-quoted strings, any single quote ' must be doubled to be
// escaped.
// When using double-quoted strings, print \x + hex for non-printable ASCII
// characters, and escape double quotes.
// When using double-quoted strings (and only in that case), non-printable characters may be
// present, and will be escaped using a variety of unicode-scalar and special short-form
// escapes. This is handled in yaml::escape.
if (MustQuote == QuotingType::Double) {
output(yaml::escape(Base, /* EscapePrintable= */ false));
this->outputUpToEndOfLine(Quote);
return;
}
// When using single-quoted strings, any single quote ' must be doubled to be escaped.
while (j < End) {
if (S[j] == QuoteChar) { // Escape quotes.
output(StringRef(&Base[i], j - i)); // "flush".
if (MustQuote == QuotingType::Double) { // Print it as \"
output(StringLiteral("\\"));
output(StringRef(Quote, 1));
} else { // Single
output(StringLiteral("''")); // Print it as ''
}
i = j + 1;
} else if (MustQuote == QuotingType::Double &&
!sys::unicode::isPrintable(S[j]) && (S[j] & 0x80) == 0) {
// If we're double quoting non-printable characters, we prefer printing
// them as "\x" + their hex representation. Note that special casing is
// needed for UTF-8, where a byte may be part of a UTF-8 sequence and
// appear as non-printable, in which case we want to print the correct
// unicode character and not its hex representation.
output(StringRef(&Base[i], j - i)); // "flush"
output(StringLiteral("\\x"));
// Output the byte 0x0F as \x0f.
auto FormattedHex = format_hex_no_prefix(S[j], 2);
Out << FormattedHex;
Column += 4; // one for the '\', one for the 'x', and two for the hex
if (S[j] == '\'') { // Escape quotes.
output(StringRef(&Base[i], j - i)); // "flush".
output(StringLiteral("''")); // Print it as ''
i = j + 1;
}
++j;

View File

@ -2464,7 +2464,10 @@ static void TestEscaped(llvm::StringRef Input, llvm::StringRef Expected) {
yamlize(xout, Input, true, Ctx);
ostr.flush();
EXPECT_EQ(Expected, out);
// Make a separate StringRef so we get nice byte-by-byte output.
llvm::StringRef Got(out);
EXPECT_EQ(Expected, Got);
}
TEST(YAMLIO, TestEscaped) {
@ -2485,4 +2488,17 @@ TEST(YAMLIO, TestEscaped) {
// UTF8 with single quote inside double quote
TestEscaped("parameter 'параметр' is unused",
"\"parameter 'параметр' is unused\"");
// String with embedded non-printable multibyte UTF-8 sequence (U+200B
// zero-width space). The thing to test here is that we emit a
// unicode-scalar level escape like \uNNNN (at the YAML level), and don't
// just pass the UTF-8 byte sequence through as with quoted printables.
TestEscaped("foo\u200Bbar", "\"foo\\u200Bbar\"");
{
const unsigned char foobar[10] = {'f', 'o', 'o',
0xE2, 0x80, 0x8B, // UTF-8 of U+200B
'b', 'a', 'r',
0x0};
TestEscaped((char const *)foobar, "\"foo\\u200Bbar\"");
}
}