llvm-mirror/lib/Support/ConvertUTFWrapper.cpp

//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/ConvertUTF.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SwapByteOrder.h"
#include <string>
#include <vector>

namespace llvm {

bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
  assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
  ConversionResult result = conversionOK;
  // Copy the character span over.
  if (WideCharWidth == 1) {
    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
      result = sourceIllegal;
      ErrorPtr = Pos;
    } else {
      memcpy(ResultPtr, Source.data(), Source.size());
      ResultPtr += Source.size();
    }
  } else if (WideCharWidth == 2) {
    const UTF8 *sourceStart = (const UTF8*)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
    ConversionFlags flags = strictConversion;
    result = ConvertUTF8toUTF16(
        &sourceStart, sourceStart + Source.size(),
        &targetStart, targetStart + Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char*>(targetStart);
    else
      ErrorPtr = sourceStart;
  } else if (WideCharWidth == 4) {
    const UTF8 *sourceStart = (const UTF8*)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
    ConversionFlags flags = strictConversion;
    result = ConvertUTF8toUTF32(
        &sourceStart, sourceStart + Source.size(),
        &targetStart, targetStart + Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char*>(targetStart);
    else
      ErrorPtr = sourceStart;
  }
  assert((result != targetExhausted)
         && "ConvertUTF8toUTFXX exhausted target buffer");
  return result == conversionOK;
}

bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
  const UTF32 *SourceStart = &Source;
  const UTF32 *SourceEnd = SourceStart + 1;
  UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr);
  UTF8 *TargetEnd = TargetStart + 4;
  ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,
                                           &TargetStart, TargetEnd,
                                           strictConversion);
  if (CR != conversionOK)
    return false;

  ResultPtr = reinterpret_cast<char*>(TargetStart);
  return true;
}

/// Return true if \p S begins with the two bytes FF FE or FE FF, i.e. a UTF-16
/// byte-order mark in either byte order.
bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
  if (S.size() < 2)
    return false;

  const char First = S[0];
  const char Second = S[1];
  const bool IsFFFE = First == '\xff' && Second == '\xfe';
  const bool IsFEFF = First == '\xfe' && Second == '\xff';
  return IsFFFE || IsFEFF;
}

/// Convert UTF-16 data, supplied as raw bytes, into a UTF-8 std::string.
///
/// \param SrcBytes UTF-16 code units viewed as bytes. If the first code unit
///        is a byte-swapped BOM the whole input is byte-swapped before
///        conversion; a (then native) leading BOM is skipped, not converted.
/// \param [out] Out Receives the UTF-8 text. Must be empty on entry.
/// \returns true on success, including empty input (which leaves \p Out
///          untouched). Returns false if the byte count is odd or if strict
///          conversion fails; in the latter case \p Out is cleared.
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
  assert(Out.empty());

  // Error out on an uneven byte count.
  if (SrcBytes.size() % 2)
    return false;

  // Avoid OOB by returning early on empty input.
  if (SrcBytes.empty())
    return true;

  const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin());
  const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end());

  // Byteswap if necessary. The swapped copy must outlive the conversion, so it
  // is declared outside the branch.
  std::vector<UTF16> ByteSwapped;
  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
    ByteSwapped.assign(Src, SrcEnd);
    // Iterate by reference rather than with an `unsigned` index so that the
    // element count is never truncated on hosts where size_t is wider.
    for (UTF16 &Unit : ByteSwapped)
      Unit = llvm::sys::SwapByteOrder_16(Unit);
    Src = ByteSwapped.data();
    SrcEnd = ByteSwapped.data() + ByteSwapped.size();
  }

  // Skip the BOM for conversion.
  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
    Src++;

  // Just allocate enough space up front.  We'll shrink it later.  Allocate
  // enough that we can fit a null terminator without reallocating.
  Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
  UTF8 *DstEnd = Dst + Out.size();

  ConversionResult CR =
      ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
  assert(CR != targetExhausted);

  if (CR != conversionOK) {
    Out.clear();
    return false;
  }

  // Trim to the bytes actually produced, then write a terminator just past
  // the end and drop it again so it is not counted in size().
  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
  Out.push_back(0);
  Out.pop_back();
  return true;
}

/// Overload taking UTF-16 code units directly: reinterprets \p Src as a byte
/// range and forwards to the ArrayRef<char> overload, returning its result.
bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out)
{
  llvm::ArrayRef<char> Bytes(reinterpret_cast<const char *>(Src.data()),
                             Src.size() * sizeof(UTF16));
  return convertUTF16ToUTF8String(Bytes, Out);
}

/// Convert the UTF-8 string \p SrcUTF8 into UTF-16 code units.
///
/// \param SrcUTF8 UTF-8 input.
/// \param [out] DstUTF16 Receives the UTF-16 code units; must be empty on
///        entry. On success a zero code unit is written just past the end
///        (and then popped), so DstUTF16.data() can be used as a
///        null-terminated string.
/// \returns true on success (including empty input); false if strict
///          conversion fails, in which case \p DstUTF16 is cleared.
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
                              SmallVectorImpl<UTF16> &DstUTF16) {
  assert(DstUTF16.empty());

  // Empty input skips the conversion entirely (avoids indexing an empty
  // buffer) and goes straight to the terminator step.
  if (!SrcUTF8.empty()) {
    const UTF8 *In = reinterpret_cast<const UTF8 *>(SrcUTF8.begin());
    const UTF8 *const InEnd = reinterpret_cast<const UTF8 *>(SrcUTF8.end());

    // A UTF-16 encoding never needs more code units than the UTF-8 encoding
    // of the same text, so one slot per input byte suffices. Reserve one
    // extra code unit for the terminator; the buffer is trimmed afterwards,
    // so over-allocating here is harmless.
    DstUTF16.resize(SrcUTF8.size() + 1);
    UTF16 *const OutBegin = &DstUTF16[0];
    UTF16 *Out = OutBegin;
    UTF16 *const OutLimit = OutBegin + DstUTF16.size();

    const ConversionResult Status =
        ConvertUTF8toUTF16(&In, InEnd, &Out, OutLimit, strictConversion);
    assert(Status != targetExhausted);

    if (Status != conversionOK) {
      DstUTF16.clear();
      return false;
    }

    // Keep only the code units that were actually produced.
    DstUTF16.resize(Out - OutBegin);
  }

  // Store a terminator past the end without counting it in size().
  DstUTF16.push_back(0);
  DstUTF16.pop_back();
  return true;
}

// The std::wstring helpers in this file dispatch on sizeof(wchar_t) and only
// implement the 1-, 2- and 4-byte cases, so reject any other width at compile
// time.
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 ||
                  sizeof(wchar_t) == 4,
              "Expected wchar_t to be 1, 2, or 4 bytes");

/// Shared implementation for the std::wstring-style overloads: converts the
/// UTF-8 text in \p Source into wchar_t units stored in \p Result.
///
/// \p Result is sized for the worst case, filled in place, and then trimmed to
/// the units actually written. On failure \p Result is cleared and false is
/// returned.
template <typename TResult>
static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,
                                             TResult &Result) {
  // Even in the case of UTF-16, the number of bytes in a UTF-8 string is
  // at least as large as the number of elements in the resulting wide
  // string, because surrogate pairs take at least 4 bytes in UTF-8.
  Result.resize(Source.size() + 1);

  auto *const WideBegin = &Result[0];
  char *Cursor = reinterpret_cast<char *>(WideBegin);
  const UTF8 *ErrorPos; // Required by the callee; not inspected here.

  const bool Converted =
      ConvertUTF8toWide(sizeof(wchar_t), Source, Cursor, ErrorPos);
  if (Converted) {
    // Cursor now points one past the last unit written.
    Result.resize(reinterpret_cast<wchar_t *>(Cursor) - WideBegin);
    return true;
  }

  Result.clear();
  return false;
}

/// Convert the UTF-8 text in \p Source to a std::wstring. Returns false (with
/// \p Result cleared) if the conversion fails.
bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {
  return ConvertUTF8toWideInternal<std::wstring>(Source, Result);
}

/// Convert the null-terminated UTF-8 string \p Source to a std::wstring.
/// A null \p Source is treated as empty input: \p Result is cleared and true
/// is returned.
bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {
  if (Source)
    return ConvertUTF8toWide(llvm::StringRef(Source), Result);

  Result.clear();
  return true;
}

/// Convert the wide string \p Source to UTF-8, interpreting its contents
/// according to sizeof(wchar_t): 1 byte = already UTF-8 (validated and
/// copied), 2 bytes = UTF-16, 4 bytes = UTF-32.
///
/// \param Source Wide-character input.
/// \param [out] Result Receives the UTF-8 bytes. Any previous content is
///        replaced on success.
/// \returns true on success, false if the input is not well-formed. The 2- and
///          4-byte paths leave \p Result empty on failure; the 1-byte path
///          leaves it unmodified.
bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {
  if (sizeof(wchar_t) == 1) {
    const UTF8 *Start = reinterpret_cast<const UTF8 *>(Source.data());
    const UTF8 *End =
        reinterpret_cast<const UTF8 *>(Source.data() + Source.size());
    if (!isLegalUTF8String(&Start, End))
      return false;
    Result.resize(Source.size());
    memcpy(&Result[0], Source.data(), Source.size());
    return true;
  } else if (sizeof(wchar_t) == 2) {
    // The UTF-16 helper asserts that its output is empty and returns early on
    // empty input without touching it. Clear first so a reused Result neither
    // trips that assert nor keeps stale content when Source is empty, matching
    // the overwrite semantics of the other branches.
    Result.clear();
    return convertUTF16ToUTF8String(
        llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),
                              Source.size()),
        Result);
  } else if (sizeof(wchar_t) == 4) {
    const UTF32 *Start = reinterpret_cast<const UTF32 *>(Source.data());
    const UTF32 *End =
        reinterpret_cast<const UTF32 *>(Source.data() + Source.size());
    // Worst case: every code point expands to the maximum UTF-8 length.
    Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());
    UTF8 *ResultPtr = reinterpret_cast<UTF8 *>(&Result[0]);
    UTF8 *ResultEnd = reinterpret_cast<UTF8 *>(&Result[0] + Result.size());
    if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,
                           strictConversion) == conversionOK) {
      // Trim to the bytes actually written.
      Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);
      return true;
    } else {
      Result.clear();
      return false;
    }
  } else {
    llvm_unreachable(
        "Control should never reach this point; see static_assert further up");
  }
}

} // end namespace llvm
Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00			`//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "llvm/Support/ConvertUTF.h"`
[NFC] Header cleanup Removed some unused headers, replaced some headers with forward class declarations. Found using simple scripts like this one: clear && ack --cpp -l '#include "llvm/ADT/IndexedMap.h"' \| xargs grep -L 'IndexedMap[<]' \| xargs grep -n --color=auto 'IndexedMap' Patch by Eugene Kosov <claprix@yandex.ru> Differential Revision: http://reviews.llvm.org/D19219 From: Mehdi Amini <mehdi.amini@apple.com> llvm-svn: 266595 2016-04-18 11:17:29 +02:00			`#include "llvm/ADT/ArrayRef.h"`
			`#include "llvm/ADT/StringRef.h"`
More UTF string conversion wrappers Added new string conversion wrappers that convert between `std::string` (of UTF-8 bytes) and `std::wstring`, which is particularly useful for Win32 interop. Also fixed a missing string conversion for `getenv` on Win32, using these new wrappers. The motivation behind this is to provide the support functions required for LLDB to work properly on Windows with non-ASCII data; however, the functions are not LLDB specific. Patch by cameron314 Differential Revision: http://reviews.llvm.org/D17549 llvm-svn: 263247 2016-03-11 16:59:32 +01:00			`#include "llvm/Support/ErrorHandling.h"`
[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`#include "llvm/Support/SwapByteOrder.h"`
			`#include <string>`
			`#include <vector>`
Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00
			`namespace llvm {`

			`bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,`
			`char &ResultPtr, const UTF8 &ErrorPtr) {`
			`assert(WideCharWidth == 1 \|\| WideCharWidth == 2 \|\| WideCharWidth == 4);`
			`ConversionResult result = conversionOK;`
			`// Copy the character span over.`
			`if (WideCharWidth == 1) {`
			`const UTF8 Pos = reinterpret_cast<const UTF8>(Source.begin());`
			`if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {`
			`result = sourceIllegal;`
			`ErrorPtr = Pos;`
			`} else {`
			`memcpy(ResultPtr, Source.data(), Source.size());`
			`ResultPtr += Source.size();`
			`}`
			`} else if (WideCharWidth == 2) {`
			`const UTF8 sourceStart = (const UTF8)Source.data();`
			`// FIXME: Make the type of the result buffer correct instead of`
			`// using reinterpret_cast.`
			`UTF16 targetStart = reinterpret_cast<UTF16>(ResultPtr);`
			`ConversionFlags flags = strictConversion;`
			`result = ConvertUTF8toUTF16(`
			`&sourceStart, sourceStart + Source.size(),`
More UTF string conversion wrappers Added new string conversion wrappers that convert between `std::string` (of UTF-8 bytes) and `std::wstring`, which is particularly useful for Win32 interop. Also fixed a missing string conversion for `getenv` on Win32, using these new wrappers. The motivation behind this is to provide the support functions required for LLDB to work properly on Windows with non-ASCII data; however, the functions are not LLDB specific. Patch by cameron314 Differential Revision: http://reviews.llvm.org/D17549 llvm-svn: 263247 2016-03-11 16:59:32 +01:00			`&targetStart, targetStart + Source.size(), flags);`
Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00			`if (result == conversionOK)`
			`ResultPtr = reinterpret_cast<char*>(targetStart);`
			`else`
			`ErrorPtr = sourceStart;`
			`} else if (WideCharWidth == 4) {`
			`const UTF8 sourceStart = (const UTF8)Source.data();`
			`// FIXME: Make the type of the result buffer correct instead of`
			`// using reinterpret_cast.`
			`UTF32 targetStart = reinterpret_cast<UTF32>(ResultPtr);`
			`ConversionFlags flags = strictConversion;`
			`result = ConvertUTF8toUTF32(`
			`&sourceStart, sourceStart + Source.size(),`
More UTF string conversion wrappers Added new string conversion wrappers that convert between `std::string` (of UTF-8 bytes) and `std::wstring`, which is particularly useful for Win32 interop. Also fixed a missing string conversion for `getenv` on Win32, using these new wrappers. The motivation behind this is to provide the support functions required for LLDB to work properly on Windows with non-ASCII data; however, the functions are not LLDB specific. Patch by cameron314 Differential Revision: http://reviews.llvm.org/D17549 llvm-svn: 263247 2016-03-11 16:59:32 +01:00			`&targetStart, targetStart + Source.size(), flags);`
Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00			`if (result == conversionOK)`
			`ResultPtr = reinterpret_cast<char*>(targetStart);`
			`else`
			`ErrorPtr = sourceStart;`
			`}`
			`assert((result != targetExhausted)`
			`&& "ConvertUTF8toUTFXX exhausted target buffer");`
			`return result == conversionOK;`
			`}`

			`bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {`
			`const UTF32 *SourceStart = &Source;`
			`const UTF32 *SourceEnd = SourceStart + 1;`
			`UTF8 TargetStart = reinterpret_cast<UTF8 >(ResultPtr);`
			`UTF8 *TargetEnd = TargetStart + 4;`
			`ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,`
			`&TargetStart, TargetEnd,`
			`strictConversion);`
			`if (CR != conversionOK)`
			`return false;`

			`ResultPtr = reinterpret_cast<char*>(TargetStart);`
			`return true;`
			`}`

[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`bool hasUTF16ByteOrderMark(ArrayRef<char> S) {`
			`return (S.size() >= 2 &&`
			`((S[0] == '\xff' && S[1] == '\xfe') \|\|`
			`(S[0] == '\xfe' && S[1] == '\xff')));`
			`}`

			`bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {`
			`assert(Out.empty());`

			`// Error out on an uneven byte count.`
			`if (SrcBytes.size() % 2)`
			`return false;`

			`// Avoid OOB by returning early on empty input.`
			`if (SrcBytes.empty())`
			`return true;`

			`const UTF16 Src = reinterpret_cast<const UTF16 >(SrcBytes.begin());`
			`const UTF16 SrcEnd = reinterpret_cast<const UTF16 >(SrcBytes.end());`

			`// Byteswap if necessary.`
			`std::vector<UTF16> ByteSwapped;`
			`if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {`
			`ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);`
			`for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I)`
			`ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]);`
			`Src = &ByteSwapped[0];`
			`SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;`
			`}`

			`// Skip the BOM for conversion.`
			`if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)`
			`Src++;`

Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`// Just allocate enough space up front. We'll shrink it later. Allocate`
			`// enough that we can fit a null terminator without reallocating.`
			`Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);`
[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`UTF8 Dst = reinterpret_cast<UTF8 >(&Out[0]);`
			`UTF8 *DstEnd = Dst + Out.size();`

			`ConversionResult CR =`
			`ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);`
			`assert(CR != targetExhausted);`

			`if (CR != conversionOK) {`
			`Out.clear();`
			`return false;`
			`}`

			`Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);`
Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`Out.push_back(0);`
			`Out.pop_back();`
[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`return true;`
			`}`

More UTF string conversion wrappers Added new string conversion wrappers that convert between `std::string` (of UTF-8 bytes) and `std::wstring`, which is particularly useful for Win32 interop. Also fixed a missing string conversion for `getenv` on Win32, using these new wrappers. The motivation behind this is to provide the support functions required for LLDB to work properly on Windows with non-ASCII data; however, the functions are not LLDB specific. Patch by cameron314 Differential Revision: http://reviews.llvm.org/D17549 llvm-svn: 263247 2016-03-11 16:59:32 +01:00			`bool convertUTF16ToUTF8String(ArrayRef<UTF16> Src, std::string &Out)`
			`{`
			`return convertUTF16ToUTF8String(`
			`llvm::ArrayRef<char>(reinterpret_cast<const char *>(Src.data()),`
			`Src.size() * sizeof(UTF16)), Out);`
			`}`

Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`bool convertUTF8ToUTF16String(StringRef SrcUTF8,`
			`SmallVectorImpl<UTF16> &DstUTF16) {`
			`assert(DstUTF16.empty());`

			`// Avoid OOB by returning early on empty input.`
Make UTF8->UTF16 conversion null terminate output on empty input. llvm-svn: 228527 2015-02-08 19:08:51 +01:00			`if (SrcUTF8.empty()) {`
			`DstUTF16.push_back(0);`
			`DstUTF16.pop_back();`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`return true;`
Make UTF8->UTF16 conversion null terminate output on empty input. llvm-svn: 228527 2015-02-08 19:08:51 +01:00			`}`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00
			`const UTF8 Src = reinterpret_cast<const UTF8 >(SrcUTF8.begin());`
			`const UTF8 SrcEnd = reinterpret_cast<const UTF8 >(SrcUTF8.end());`

			`// Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding`
			`// as UTF-16 should always require the same amount or less code units than the`
Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`// UTF-8 encoding. Allocate one extra byte for the null terminator though,`
			`// so that someone calling DstUTF16.data() gets a null terminated string.`
			`// We resize down later so we don't have to worry that this over allocates.`
			`DstUTF16.resize(SrcUTF8.size()+1);`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`UTF16 *Dst = &DstUTF16[0];`
			`UTF16 *DstEnd = Dst + DstUTF16.size();`

			`ConversionResult CR =`
			`ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion);`
			`assert(CR != targetExhausted);`

			`if (CR != conversionOK) {`
			`DstUTF16.clear();`
			`return false;`
			`}`

			`DstUTF16.resize(Dst - &DstUTF16[0]);`
Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`DstUTF16.push_back(0);`
			`DstUTF16.pop_back();`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`return true;`
			`}`

More UTF string conversion wrappers Added new string conversion wrappers that convert between `std::string` (of UTF-8 bytes) and `std::wstring`, which is particularly useful for Win32 interop. Also fixed a missing string conversion for `getenv` on Win32, using these new wrappers. The motivation behind this is to provide the support functions required for LLDB to work properly on Windows with non-ASCII data; however, the functions are not LLDB specific. Patch by cameron314 Differential Revision: http://reviews.llvm.org/D17549 llvm-svn: 263247 2016-03-11 16:59:32 +01:00			`static_assert(sizeof(wchar_t) == 1 \|\| sizeof(wchar_t) == 2 \|\|`
			`sizeof(wchar_t) == 4,`
			`"Expected wchar_t to be 1, 2, or 4 bytes");`

			`template <typename TResult>`
			`static inline bool ConvertUTF8toWideInternal(llvm::StringRef Source,`
			`TResult &Result) {`
			`// Even in the case of UTF-16, the number of bytes in a UTF-8 string is`
			`// at least as large as the number of elements in the resulting wide`
			`// string, because surrogate pairs take at least 4 bytes in UTF-8.`
			`Result.resize(Source.size() + 1);`
			`char ResultPtr = reinterpret_cast<char >(&Result[0]);`
			`const UTF8 *ErrorPtr;`
			`if (!ConvertUTF8toWide(sizeof(wchar_t), Source, ResultPtr, ErrorPtr)) {`
			`Result.clear();`
			`return false;`
			`}`
			`Result.resize(reinterpret_cast<wchar_t *>(ResultPtr) - &Result[0]);`
			`return true;`
			`}`

			`bool ConvertUTF8toWide(llvm::StringRef Source, std::wstring &Result) {`
			`return ConvertUTF8toWideInternal(Source, Result);`
			`}`

			`bool ConvertUTF8toWide(const char *Source, std::wstring &Result) {`
			`if (!Source) {`
			`Result.clear();`
			`return true;`
			`}`
			`return ConvertUTF8toWide(llvm::StringRef(Source), Result);`
			`}`

			`bool convertWideToUTF8(const std::wstring &Source, std::string &Result) {`
			`if (sizeof(wchar_t) == 1) {`
			`const UTF8 Start = reinterpret_cast<const UTF8 >(Source.data());`
			`const UTF8 *End =`
			`reinterpret_cast<const UTF8 *>(Source.data() + Source.size());`
			`if (!isLegalUTF8String(&Start, End))`
			`return false;`
			`Result.resize(Source.size());`
			`memcpy(&Result[0], Source.data(), Source.size());`
			`return true;`
			`} else if (sizeof(wchar_t) == 2) {`
			`return convertUTF16ToUTF8String(`
			`llvm::ArrayRef<UTF16>(reinterpret_cast<const UTF16 *>(Source.data()),`
			`Source.size()),`
			`Result);`
			`} else if (sizeof(wchar_t) == 4) {`
			`const UTF32 Start = reinterpret_cast<const UTF32 >(Source.data());`
			`const UTF32 *End =`
			`reinterpret_cast<const UTF32 *>(Source.data() + Source.size());`
			`Result.resize(UNI_MAX_UTF8_BYTES_PER_CODE_POINT * Source.size());`
			`UTF8 ResultPtr = reinterpret_cast<UTF8 >(&Result[0]);`
			`UTF8 ResultEnd = reinterpret_cast<UTF8 >(&Result[0] + Result.size());`
			`if (ConvertUTF32toUTF8(&Start, End, &ResultPtr, ResultEnd,`
			`strictConversion) == conversionOK) {`
			`Result.resize(reinterpret_cast<char *>(ResultPtr) - &Result[0]);`
			`return true;`
			`} else {`
			`Result.clear();`
			`return false;`
			`}`
			`} else {`
			`llvm_unreachable(`
			`"Control should never reach this point; see static_assert further up");`
			`}`
			`}`

Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00			`} // end namespace llvm`