mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
Move generic isPrint and columnWidth implementations to a separate header/source to allow using both generic and system-dependent versions on win32.
Summary: This is needed so we can use generic columnWidthUTF8 in clang-format on win32 simultaneously with separate system-dependent implementations of isPrint/columnWidth in TextDiagnostic.cpp to avoid attempts to print Unicode characters using narrow-character interfaces (which is not supported on Windows, and we'll have to figure out how to handle this). Reviewers: jordan_rose Reviewed By: jordan_rose CC: llvm-commits, klimek Differential Revision: http://llvm-reviews.chandlerc.com/D1559 llvm-svn: 189952
This commit is contained in:
parent
da89e5d74e
commit
6646ff4786
62
include/llvm/Support/Unicode.h
Normal file
62
include/llvm/Support/Unicode.h
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
//===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// This file defines functions that allow querying certain properties of Unicode
|
||||||
|
// characters.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "llvm/ADT/StringRef.h"
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
namespace sys {
|
||||||
|
namespace unicode {
|
||||||
|
|
||||||
|
enum ColumnWidthErrors {
|
||||||
|
ErrorInvalidUTF8 = -2,
|
||||||
|
ErrorNonPrintableCharacter = -1
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Determines if a character is likely to be displayed correctly on the
|
||||||
|
/// terminal. Exact implementation would have to depend on the specific
|
||||||
|
/// terminal, so we define the semantics that should be suitable for the generic
|
||||||
|
/// case of a terminal capable of outputting Unicode characters.
|
||||||
|
///
|
||||||
|
/// All characters from the Unicode code point range are considered printable
|
||||||
|
/// except for:
|
||||||
|
/// * C0 and C1 control character ranges;
|
||||||
|
/// * default ignorable code points as per 5.21 of
|
||||||
|
/// http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
|
||||||
|
/// except for U+00AD SOFT HYPHEN, as it's actually displayed on most
|
||||||
|
/// terminals;
|
||||||
|
/// * format characters (category = Cf);
|
||||||
|
/// * surrogates (category = Cs);
|
||||||
|
/// * unassigned characters (category = Cn).
|
||||||
|
/// \return true if the character is considered printable.
|
||||||
|
bool isPrintable(int UCS);
|
||||||
|
|
||||||
|
/// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
|
||||||
|
/// when output on a terminal ("character width"). This depends on the
|
||||||
|
/// implementation of the terminal, and there's no standard definition of
|
||||||
|
/// character width.
|
||||||
|
///
|
||||||
|
/// The implementation defines it in a way that is expected to be compatible
|
||||||
|
/// with a generic Unicode-capable terminal.
|
||||||
|
///
|
||||||
|
/// \return Character width:
|
||||||
|
/// * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable
|
||||||
|
/// characters (as identified by isPrintable);
|
||||||
|
/// * 0 for each non-spacing and enclosing combining mark;
|
||||||
|
/// * 2 for each CJK character excluding halfwidth forms;
|
||||||
|
/// * 1 for each of the remaining characters.
|
||||||
|
int columnWidthUTF8(StringRef Text);
|
||||||
|
|
||||||
|
} // namespace unicode
|
||||||
|
} // namespace sys
|
||||||
|
} // namespace llvm
|
@ -54,6 +54,7 @@ add_llvm_library(LLVMSupport
|
|||||||
ToolOutputFile.cpp
|
ToolOutputFile.cpp
|
||||||
Triple.cpp
|
Triple.cpp
|
||||||
Twine.cpp
|
Twine.cpp
|
||||||
|
Unicode.cpp
|
||||||
YAMLParser.cpp
|
YAMLParser.cpp
|
||||||
YAMLTraits.cpp
|
YAMLTraits.cpp
|
||||||
raw_os_ostream.cpp
|
raw_os_ostream.cpp
|
||||||
|
@ -1,10 +1,31 @@
|
|||||||
#include "llvm/Support/Locale.h"
|
#include "llvm/Support/Locale.h"
|
||||||
#include "llvm/Config/config.h"
|
#include "llvm/Support/Unicode.h"
|
||||||
|
|
||||||
#ifdef __APPLE__
|
namespace llvm {
|
||||||
#include "LocaleXlocale.inc"
|
namespace sys {
|
||||||
#elif LLVM_ON_WIN32
|
namespace locale {
|
||||||
#include "LocaleWindows.inc"
|
|
||||||
|
int columnWidth(StringRef Text) {
|
||||||
|
#if LLVM_ON_WIN32
|
||||||
|
return Text.size();
|
||||||
#else
|
#else
|
||||||
#include "LocaleGeneric.inc"
|
return llvm::sys::unicode::columnWidthUTF8(Text);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isPrint(int UCS) {
|
||||||
|
#if LLVM_ON_WIN32
|
||||||
|
// Restrict characters that we'll try to print to the lower part of ASCII
|
||||||
|
// except for the control characters (0x20 - 0x7E). In general one cannot
|
||||||
|
// reliably output code points U+0080 and higher using narrow character C/C++
|
||||||
|
// output functions in Windows, because the meaning of the upper 128 codes is
|
||||||
|
// determined by the active code page in the console.
|
||||||
|
return ' ' <= UCS && UCS <= '~';
|
||||||
|
#else
|
||||||
|
return llvm::sys::unicode::isPrintable(UCS);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace locale
|
||||||
|
} // namespace sys
|
||||||
|
} // namespace llvm
|
||||||
|
@ -1,15 +0,0 @@
|
|||||||
namespace llvm {
|
|
||||||
namespace sys {
|
|
||||||
namespace locale {
|
|
||||||
|
|
||||||
int columnWidth(StringRef s) {
|
|
||||||
return s.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool isPrint(int c) {
|
|
||||||
return ' ' <= c && c <= '~';
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,61 +0,0 @@
|
|||||||
#include "llvm/ADT/SmallString.h"
|
|
||||||
#include "llvm/ADT/SmallVector.h"
|
|
||||||
#include "llvm/Support/ManagedStatic.h"
|
|
||||||
#include <cassert>
|
|
||||||
#include <xlocale.h>
|
|
||||||
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
struct locale_holder {
|
|
||||||
locale_holder()
|
|
||||||
: l(newlocale(LC_CTYPE_MASK,"en_US.UTF-8",LC_GLOBAL_LOCALE))
|
|
||||||
{
|
|
||||||
assert(NULL!=l);
|
|
||||||
}
|
|
||||||
~locale_holder() {
|
|
||||||
freelocale(l);
|
|
||||||
}
|
|
||||||
|
|
||||||
int mbswidth(llvm::SmallString<16> s) const {
|
|
||||||
// this implementation assumes no '\0' in s
|
|
||||||
assert(s.size()==strlen(s.c_str()));
|
|
||||||
|
|
||||||
size_t size = mbstowcs_l(NULL,s.c_str(),0,l);
|
|
||||||
assert(size!=(size_t)-1);
|
|
||||||
if (size==0)
|
|
||||||
return 0;
|
|
||||||
llvm::SmallVector<wchar_t,200> ws(size);
|
|
||||||
size = mbstowcs_l(&ws[0],s.c_str(),ws.size(),l);
|
|
||||||
assert(ws.size()==size);
|
|
||||||
return wcswidth_l(&ws[0],ws.size(),l);
|
|
||||||
}
|
|
||||||
|
|
||||||
int isprint(int c) const {
|
|
||||||
return iswprint_l(c,l);
|
|
||||||
}
|
|
||||||
|
|
||||||
private:
|
|
||||||
|
|
||||||
locale_t l;
|
|
||||||
};
|
|
||||||
|
|
||||||
llvm::ManagedStatic<locale_holder> l;
|
|
||||||
}
|
|
||||||
|
|
||||||
namespace llvm {
|
|
||||||
namespace sys {
|
|
||||||
namespace locale {
|
|
||||||
|
|
||||||
int columnWidth(StringRef s) {
|
|
||||||
int width = l->mbswidth(s);
|
|
||||||
assert(width>=0);
|
|
||||||
return width;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool isPrint(int c) {
|
|
||||||
return l->isprint(c);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,4 +1,4 @@
|
|||||||
//===- llvm/Support/LocaleGeneric.inc - Locale-dependent stuff -*- C++ -*-===//
|
//===- llvm/Support/Unicode.cpp - Unicode character properties -*- C++ -*-===//
|
||||||
//
|
//
|
||||||
// The LLVM Compiler Infrastructure
|
// The LLVM Compiler Infrastructure
|
||||||
//
|
//
|
||||||
@ -7,41 +7,20 @@
|
|||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
//
|
//
|
||||||
// This file implements llvm::sys::locale::columnWidth and
|
// This file implements functions that allow querying certain properties of
|
||||||
// llvm::sys::locale::isPrint functions for UTF-8 locales.
|
// Unicode characters.
|
||||||
//
|
//
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
#include "llvm/ADT/ArrayRef.h"
|
#include "llvm/Support/Unicode.h"
|
||||||
#include "llvm/Support/ConvertUTF.h"
|
#include "llvm/Support/ConvertUTF.h"
|
||||||
#include "llvm/Support/raw_ostream.h"
|
|
||||||
#include "llvm/Support/UnicodeCharRanges.h"
|
#include "llvm/Support/UnicodeCharRanges.h"
|
||||||
|
|
||||||
namespace llvm {
|
namespace llvm {
|
||||||
namespace sys {
|
namespace sys {
|
||||||
namespace locale {
|
namespace unicode {
|
||||||
|
|
||||||
enum ColumnWidthErrors {
|
bool isPrintable(int UCS) {
|
||||||
ErrorInvalidUTF8 = -2,
|
|
||||||
ErrorNonPrintableCharacter = -1
|
|
||||||
};
|
|
||||||
|
|
||||||
/// Determines if a character is likely to be displayed correctly on the
|
|
||||||
/// terminal. Exact implementation would have to depend on the specific
|
|
||||||
/// terminal, so we define the semantic that should be suitable for generic case
|
|
||||||
/// of a terminal capable to output Unicode characters.
|
|
||||||
/// All characters from the Unicode codepoint range are considered printable
|
|
||||||
/// except for:
|
|
||||||
/// * C0 and C1 control character ranges;
|
|
||||||
/// * default ignorable code points as per 5.21 of
|
|
||||||
/// http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
|
|
||||||
/// except for U+00AD SOFT HYPHEN, as it's actually displayed on most
|
|
||||||
/// terminals;
|
|
||||||
/// * format characters (category = Cf);
|
|
||||||
/// * surrogates (category = Cs);
|
|
||||||
/// * unassigned characters (category = Cn).
|
|
||||||
/// \return true if the character is considered printable.
|
|
||||||
bool isPrint(int UCS) {
|
|
||||||
// Sorted list of non-overlapping intervals of code points that are not
|
// Sorted list of non-overlapping intervals of code points that are not
|
||||||
// supposed to be printable.
|
// supposed to be printable.
|
||||||
static const UnicodeCharRange NonPrintableRanges[] = {
|
static const UnicodeCharRange NonPrintableRanges[] = {
|
||||||
@ -241,13 +220,13 @@ bool isPrint(int UCS) {
|
|||||||
/// with a generic Unicode-capable terminal.
|
/// with a generic Unicode-capable terminal.
|
||||||
/// \return Character width:
|
/// \return Character width:
|
||||||
/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as
|
/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as
|
||||||
/// identified by isPrint);
|
/// identified by isPrintable);
|
||||||
/// * 0 for non-spacing and enclosing combining marks;
|
/// * 0 for non-spacing and enclosing combining marks;
|
||||||
/// * 2 for CJK characters excluding halfwidth forms;
|
/// * 2 for CJK characters excluding halfwidth forms;
|
||||||
/// * 1 for all remaining characters.
|
/// * 1 for all remaining characters.
|
||||||
static inline int charWidth(int UCS)
|
static inline int charWidth(int UCS)
|
||||||
{
|
{
|
||||||
if (!isPrint(UCS))
|
if (!isPrintable(UCS))
|
||||||
return ErrorNonPrintableCharacter;
|
return ErrorNonPrintableCharacter;
|
||||||
|
|
||||||
// Sorted list of non-spacing and enclosing combining mark intervals as
|
// Sorted list of non-spacing and enclosing combining mark intervals as
|
||||||
@ -361,7 +340,7 @@ static inline int charWidth(int UCS)
|
|||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
int columnWidth(StringRef Text) {
|
int columnWidthUTF8(StringRef Text) {
|
||||||
unsigned ColumnWidth = 0;
|
unsigned ColumnWidth = 0;
|
||||||
unsigned Length;
|
unsigned Length;
|
||||||
for (size_t i = 0, e = Text.size(); i < e; i += Length) {
|
for (size_t i = 0, e = Text.size(); i < e; i += Length) {
|
||||||
@ -382,6 +361,7 @@ int columnWidth(StringRef Text) {
|
|||||||
return ColumnWidth;
|
return ColumnWidth;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} // namespace unicode
|
||||||
}
|
} // namespace sys
|
||||||
}
|
} // namespace llvm
|
||||||
|
|
@ -19,7 +19,6 @@ add_llvm_unittest(SupportTests
|
|||||||
FileOutputBufferTest.cpp
|
FileOutputBufferTest.cpp
|
||||||
IntegersSubsetTest.cpp
|
IntegersSubsetTest.cpp
|
||||||
LeakDetectorTest.cpp
|
LeakDetectorTest.cpp
|
||||||
LocaleTest.cpp
|
|
||||||
LockFileManagerTest.cpp
|
LockFileManagerTest.cpp
|
||||||
ManagedStatic.cpp
|
ManagedStatic.cpp
|
||||||
MathExtrasTest.cpp
|
MathExtrasTest.cpp
|
||||||
@ -32,6 +31,7 @@ add_llvm_unittest(SupportTests
|
|||||||
RegexTest.cpp
|
RegexTest.cpp
|
||||||
SwapByteOrderTest.cpp
|
SwapByteOrderTest.cpp
|
||||||
TimeValueTest.cpp
|
TimeValueTest.cpp
|
||||||
|
UnicodeTest.cpp
|
||||||
ValueHandleTest.cpp
|
ValueHandleTest.cpp
|
||||||
YAMLIOTest.cpp
|
YAMLIOTest.cpp
|
||||||
YAMLParserTest.cpp
|
YAMLParserTest.cpp
|
||||||
|
@ -1,104 +0,0 @@
|
|||||||
//===- unittests/Support/LocaleTest.cpp - Locale.h tests ------------------===//
|
|
||||||
//
|
|
||||||
// The LLVM Compiler Infrastructure
|
|
||||||
//
|
|
||||||
// This file is distributed under the University of Illinois Open Source
|
|
||||||
// License. See LICENSE.TXT for details.
|
|
||||||
//
|
|
||||||
//===----------------------------------------------------------------------===//
|
|
||||||
|
|
||||||
#include "llvm/Support/Locale.h"
|
|
||||||
#include "gtest/gtest.h"
|
|
||||||
|
|
||||||
namespace llvm {
|
|
||||||
namespace sys {
|
|
||||||
namespace locale {
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
// FIXME: WIN32 implementation is incorrect. We should consider using the one
|
|
||||||
// from LocaleGeneric.inc for WIN32.
|
|
||||||
#ifndef _WIN32
|
|
||||||
TEST(Locale, columnWidth) {
|
|
||||||
// FIXME: This test fails with MacOSX implementation of columnWidth.
|
|
||||||
#ifndef __APPLE__
|
|
||||||
EXPECT_EQ(0, columnWidth(""));
|
|
||||||
EXPECT_EQ(1, columnWidth(" "));
|
|
||||||
EXPECT_EQ(1, columnWidth("a"));
|
|
||||||
EXPECT_EQ(1, columnWidth("~"));
|
|
||||||
|
|
||||||
EXPECT_EQ(6, columnWidth("abcdef"));
|
|
||||||
|
|
||||||
EXPECT_EQ(-1, columnWidth("\x01"));
|
|
||||||
EXPECT_EQ(-1, columnWidth("aaaaaaaaaa\x01"));
|
|
||||||
EXPECT_EQ(-1, columnWidth("\342\200\213")); // 200B ZERO WIDTH SPACE
|
|
||||||
|
|
||||||
// 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some
|
|
||||||
// text editors display it only when a line is broken at it, some use it as a
|
|
||||||
// line-break hint, but don't display. We choose terminal-oriented
|
|
||||||
// interpretation.
|
|
||||||
EXPECT_EQ(1, columnWidth("\302\255"));
|
|
||||||
|
|
||||||
EXPECT_EQ(0, columnWidth("\314\200")); // 0300 COMBINING GRAVE ACCENT
|
|
||||||
EXPECT_EQ(1, columnWidth("\340\270\201")); // 0E01 THAI CHARACTER KO KAI
|
|
||||||
EXPECT_EQ(2, columnWidth("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00
|
|
||||||
|
|
||||||
EXPECT_EQ(4, columnWidth("\344\270\200\344\270\200"));
|
|
||||||
EXPECT_EQ(3, columnWidth("q\344\270\200"));
|
|
||||||
EXPECT_EQ(3, columnWidth("\314\200\340\270\201\344\270\200"));
|
|
||||||
|
|
||||||
// Invalid UTF-8 strings, columnWidth should error out.
|
|
||||||
EXPECT_EQ(-2, columnWidth("\344"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("\344\270"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("\344\270\033"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("\344\270\300"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("\377\366\355"));
|
|
||||||
|
|
||||||
EXPECT_EQ(-2, columnWidth("qwer\344"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("qwer\344\270"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("qwer\344\270\033"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("qwer\344\270\300"));
|
|
||||||
EXPECT_EQ(-2, columnWidth("qwer\377\366\355"));
|
|
||||||
|
|
||||||
// UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode
|
|
||||||
// characters.
|
|
||||||
EXPECT_EQ(-2, columnWidth("\370\200\200\200\200")); // U+200000
|
|
||||||
EXPECT_EQ(-2, columnWidth("\374\200\200\200\200\200")); // U+4000000
|
|
||||||
#endif // __APPLE__
|
|
||||||
}
|
|
||||||
|
|
||||||
TEST(Locale, isPrint) {
|
|
||||||
EXPECT_FALSE(isPrint(0)); // <control-0000>-<control-001F>
|
|
||||||
EXPECT_FALSE(isPrint(0x01));
|
|
||||||
EXPECT_FALSE(isPrint(0x1F));
|
|
||||||
EXPECT_TRUE(isPrint(' '));
|
|
||||||
EXPECT_TRUE(isPrint('A'));
|
|
||||||
EXPECT_TRUE(isPrint('~'));
|
|
||||||
EXPECT_FALSE(isPrint(0x7F)); // <control-007F>..<control-009F>
|
|
||||||
EXPECT_FALSE(isPrint(0x90));
|
|
||||||
EXPECT_FALSE(isPrint(0x9F));
|
|
||||||
|
|
||||||
EXPECT_TRUE(isPrint(0xAC));
|
|
||||||
EXPECT_TRUE(isPrint(0xAD)); // SOFT HYPHEN is displayed on most terminals
|
|
||||||
// as either a space or a dash.
|
|
||||||
EXPECT_TRUE(isPrint(0xAE));
|
|
||||||
|
|
||||||
// MacOS implementation doesn't think it's printable.
|
|
||||||
#ifndef __APPLE__
|
|
||||||
EXPECT_TRUE(isPrint(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
|
|
||||||
#endif // __APPLE__
|
|
||||||
EXPECT_FALSE(isPrint(0x0378)); // <reserved-0378>..<reserved-0379>
|
|
||||||
|
|
||||||
EXPECT_FALSE(isPrint(0x0600)); // ARABIC NUMBER SIGN
|
|
||||||
|
|
||||||
EXPECT_FALSE(isPrint(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF>
|
|
||||||
EXPECT_TRUE(isPrint(0x20000)); // CJK UNIFIED IDEOGRAPH-20000
|
|
||||||
|
|
||||||
EXPECT_FALSE(isPrint(0x10FFFF)); // noncharacter
|
|
||||||
}
|
|
||||||
|
|
||||||
#endif // _WIN32
|
|
||||||
|
|
||||||
} // namespace
|
|
||||||
} // namespace locale
|
|
||||||
} // namespace sys
|
|
||||||
} // namespace llvm
|
|
93
unittests/Support/UnicodeTest.cpp
Normal file
93
unittests/Support/UnicodeTest.cpp
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
//===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "llvm/Support/Unicode.h"
|
||||||
|
#include "gtest/gtest.h"
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
namespace sys {
|
||||||
|
namespace unicode {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
TEST(Unicode, columnWidthUTF8) {
|
||||||
|
EXPECT_EQ(0, columnWidthUTF8(""));
|
||||||
|
EXPECT_EQ(1, columnWidthUTF8(" "));
|
||||||
|
EXPECT_EQ(1, columnWidthUTF8("a"));
|
||||||
|
EXPECT_EQ(1, columnWidthUTF8("~"));
|
||||||
|
|
||||||
|
EXPECT_EQ(6, columnWidthUTF8("abcdef"));
|
||||||
|
|
||||||
|
EXPECT_EQ(-1, columnWidthUTF8("\x01"));
|
||||||
|
EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01"));
|
||||||
|
EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE
|
||||||
|
|
||||||
|
// 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some
|
||||||
|
// text editors display it only when a line is broken at it, some use it as a
|
||||||
|
// line-break hint, but don't display it. We choose the terminal-oriented
|
||||||
|
// interpretation.
|
||||||
|
EXPECT_EQ(1, columnWidthUTF8("\302\255"));
|
||||||
|
|
||||||
|
EXPECT_EQ(0, columnWidthUTF8("\314\200")); // 0300 COMBINING GRAVE ACCENT
|
||||||
|
EXPECT_EQ(1, columnWidthUTF8("\340\270\201")); // 0E01 THAI CHARACTER KO KAI
|
||||||
|
EXPECT_EQ(2, columnWidthUTF8("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00
|
||||||
|
|
||||||
|
EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200"));
|
||||||
|
EXPECT_EQ(3, columnWidthUTF8("q\344\270\200"));
|
||||||
|
EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200"));
|
||||||
|
|
||||||
|
// Invalid UTF-8 strings, columnWidthUTF8 should error out.
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("\344"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("\344\270"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("\344\270\033"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("\344\270\300"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("\377\366\355"));
|
||||||
|
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("qwer\344"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300"));
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355"));
|
||||||
|
|
||||||
|
// UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode
|
||||||
|
// characters.
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200")); // U+200000
|
||||||
|
EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200")); // U+4000000
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST(Unicode, isPrintable) {
|
||||||
|
EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F>
|
||||||
|
EXPECT_FALSE(isPrintable(0x01));
|
||||||
|
EXPECT_FALSE(isPrintable(0x1F));
|
||||||
|
EXPECT_TRUE(isPrintable(' '));
|
||||||
|
EXPECT_TRUE(isPrintable('A'));
|
||||||
|
EXPECT_TRUE(isPrintable('~'));
|
||||||
|
EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F>
|
||||||
|
EXPECT_FALSE(isPrintable(0x90));
|
||||||
|
EXPECT_FALSE(isPrintable(0x9F));
|
||||||
|
|
||||||
|
EXPECT_TRUE(isPrintable(0xAC));
|
||||||
|
EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals
|
||||||
|
// as either a space or a dash.
|
||||||
|
EXPECT_TRUE(isPrintable(0xAE));
|
||||||
|
|
||||||
|
EXPECT_TRUE(isPrintable(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
|
||||||
|
EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379>
|
||||||
|
|
||||||
|
EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN
|
||||||
|
|
||||||
|
EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF>
|
||||||
|
EXPECT_TRUE(isPrintable(0x20000)); // CJK UNIFIED IDEOGRAPH-20000
|
||||||
|
|
||||||
|
EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
} // namespace unicode
|
||||||
|
} // namespace sys
|
||||||
|
} // namespace llvm
|
Loading…
x
Reference in New Issue
Block a user