llvm-mirror/lib/Support/ConvertUTFWrapper.cpp

//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/SwapByteOrder.h"
#include <string>
#include <vector>

namespace llvm {

bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,
                       char *&ResultPtr, const UTF8 *&ErrorPtr) {
  assert(WideCharWidth == 1 || WideCharWidth == 2 || WideCharWidth == 4);
  ConversionResult result = conversionOK;
  // Copy the character span over.
  if (WideCharWidth == 1) {
    const UTF8 *Pos = reinterpret_cast<const UTF8*>(Source.begin());
    if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {
      result = sourceIllegal;
      ErrorPtr = Pos;
    } else {
      memcpy(ResultPtr, Source.data(), Source.size());
      ResultPtr += Source.size();
    }
  } else if (WideCharWidth == 2) {
    const UTF8 *sourceStart = (const UTF8*)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
    ConversionFlags flags = strictConversion;
    result = ConvertUTF8toUTF16(
        &sourceStart, sourceStart + Source.size(),
        &targetStart, targetStart + 2*Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char*>(targetStart);
    else
      ErrorPtr = sourceStart;
  } else if (WideCharWidth == 4) {
    const UTF8 *sourceStart = (const UTF8*)Source.data();
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
    ConversionFlags flags = strictConversion;
    result = ConvertUTF8toUTF32(
        &sourceStart, sourceStart + Source.size(),
        &targetStart, targetStart + 4*Source.size(), flags);
    if (result == conversionOK)
      ResultPtr = reinterpret_cast<char*>(targetStart);
    else
      ErrorPtr = sourceStart;
  }
  assert((result != targetExhausted)
         && "ConvertUTF8toUTFXX exhausted target buffer");
  return result == conversionOK;
}

bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
  const UTF32 *SourceStart = &Source;
  const UTF32 *SourceEnd = SourceStart + 1;
  UTF8 *TargetStart = reinterpret_cast<UTF8 *>(ResultPtr);
  UTF8 *TargetEnd = TargetStart + 4;
  ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,
                                           &TargetStart, TargetEnd,
                                           strictConversion);
  if (CR != conversionOK)
    return false;

  ResultPtr = reinterpret_cast<char*>(TargetStart);
  return true;
}

/// Report whether \p S begins with a UTF-16 byte order mark, in either byte
/// order: the byte pair FF FE or the byte pair FE FF.
///
/// \returns false if \p S holds fewer than two bytes.
bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
  if (S.size() < 2)
    return false;

  const char First = S[0];
  const char Second = S[1];
  if (First == '\xff')
    return Second == '\xfe';
  return First == '\xfe' && Second == '\xff';
}

/// Convert the UTF-16 byte stream \p SrcBytes to UTF-8 and store it in
/// \p Out.
///
/// A leading byte order mark is honoured: if the first code unit is the
/// byte-swapped BOM, the whole input is byte-swapped into a temporary before
/// conversion; a (now native) BOM is then skipped and not emitted.
///
/// \param SrcBytes The raw UTF-16 bytes.  An odd byte count is rejected.
/// \param Out      Receives the UTF-8 result; must be empty on entry.  It is
///                 cleared if the conversion fails.
/// \returns true on success (including empty input), false on an odd byte
///          count or a conversion failure.
bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
  assert(Out.empty());

  // Error out on an uneven byte count.
  if (SrcBytes.size() % 2)
    return false;

  // Avoid OOB by returning early on empty input.
  if (SrcBytes.empty())
    return true;

  // NOTE(review): the bytes are read in place as UTF16 code units; this
  // presumably relies on SrcBytes being suitably aligned -- confirm callers.
  const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin());
  const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end());

  // Byteswap if necessary.
  std::vector<UTF16> ByteSwapped;
  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
    ByteSwapped.assign(Src, SrcEnd);
    // Index with the vector's own size type so that the count is never
    // truncated and every code unit is swapped.
    typedef std::vector<UTF16>::size_type SizeTy;
    for (SizeTy I = 0, E = ByteSwapped.size(); I != E; ++I)
      ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]);
    Src = &ByteSwapped[0];
    SrcEnd = Src + ByteSwapped.size();
  }

  // Skip the BOM for conversion.
  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
    Src++;

  // Just allocate enough space up front.  We'll shrink it later.  Allocate
  // enough that we can fit a null terminator without reallocating.
  Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);
  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
  UTF8 *DstEnd = Dst + Out.size();

  ConversionResult CR =
      ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
  assert(CR != targetExhausted);

  if (CR != conversionOK) {
    Out.clear();
    return false;
  }

  // Trim to the bytes actually written, then push and pop a zero so that a
  // terminator sits just past the end of the string's contents.
  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
  Out.push_back(0);
  Out.pop_back();
  return true;
}

/// Convert the UTF-8 text \p SrcUTF8 to UTF-16 and store the code units in
/// \p DstUTF16.
///
/// On success a zero code unit is pushed and immediately popped, so that a
/// terminator is written just past the end of the result without being
/// counted in its size.
///
/// \param SrcUTF8  The UTF-8 input.
/// \param DstUTF16 Receives the UTF-16 code units; must be empty on entry.
///                 It is cleared if the conversion fails.
/// \returns true on success (including empty input), false otherwise.
bool convertUTF8ToUTF16String(StringRef SrcUTF8,
                              SmallVectorImpl<UTF16> &DstUTF16) {
  assert(DstUTF16.empty());

  // Empty input skips the conversion entirely (no element access on an empty
  // buffer) and goes straight to writing the terminator.
  if (!SrcUTF8.empty()) {
    const UTF8 *InCursor = reinterpret_cast<const UTF8 *>(SrcUTF8.begin());
    const UTF8 *const InLimit = reinterpret_cast<const UTF8 *>(SrcUTF8.end());

    // UTF-16 should never need more code units than the UTF-8 encoding has,
    // so size the output to match the input, plus one extra code unit for the
    // terminator.  Over-allocation is harmless: we shrink afterwards.
    DstUTF16.resize(SrcUTF8.size() + 1);
    UTF16 *const OutBase = &DstUTF16[0];
    UTF16 *OutCursor = OutBase;

    const ConversionResult Status =
        ConvertUTF8toUTF16(&InCursor, InLimit, &OutCursor,
                           OutBase + DstUTF16.size(), strictConversion);
    assert(Status != targetExhausted);

    if (Status != conversionOK) {
      DstUTF16.clear();
      return false;
    }

    // Keep only the code units that were actually produced.
    DstUTF16.resize(OutCursor - OutBase);
  }

  // Leave a zero just past the end so DstUTF16.data() is null terminated.
  DstUTF16.push_back(0);
  DstUTF16.pop_back();
  return true;
}

} // end namespace llvm
Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00			`//===-- ConvertUTFWrapper.cpp - Wrap ConvertUTF.h with clang data types -----===`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "llvm/Support/ConvertUTF.h"`
[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`#include "llvm/Support/SwapByteOrder.h"`
			`#include <string>`
			`#include <vector>`
Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00
			`namespace llvm {`

			`bool ConvertUTF8toWide(unsigned WideCharWidth, llvm::StringRef Source,`
			`char &ResultPtr, const UTF8 &ErrorPtr) {`
			`assert(WideCharWidth == 1 \|\| WideCharWidth == 2 \|\| WideCharWidth == 4);`
			`ConversionResult result = conversionOK;`
			`// Copy the character span over.`
			`if (WideCharWidth == 1) {`
			`const UTF8 Pos = reinterpret_cast<const UTF8>(Source.begin());`
			`if (!isLegalUTF8String(&Pos, reinterpret_cast<const UTF8*>(Source.end()))) {`
			`result = sourceIllegal;`
			`ErrorPtr = Pos;`
			`} else {`
			`memcpy(ResultPtr, Source.data(), Source.size());`
			`ResultPtr += Source.size();`
			`}`
			`} else if (WideCharWidth == 2) {`
			`const UTF8 sourceStart = (const UTF8)Source.data();`
			`// FIXME: Make the type of the result buffer correct instead of`
			`// using reinterpret_cast.`
			`UTF16 targetStart = reinterpret_cast<UTF16>(ResultPtr);`
			`ConversionFlags flags = strictConversion;`
			`result = ConvertUTF8toUTF16(`
			`&sourceStart, sourceStart + Source.size(),`
			`&targetStart, targetStart + 2*Source.size(), flags);`
			`if (result == conversionOK)`
			`ResultPtr = reinterpret_cast<char*>(targetStart);`
			`else`
			`ErrorPtr = sourceStart;`
			`} else if (WideCharWidth == 4) {`
			`const UTF8 sourceStart = (const UTF8)Source.data();`
			`// FIXME: Make the type of the result buffer correct instead of`
			`// using reinterpret_cast.`
			`UTF32 targetStart = reinterpret_cast<UTF32>(ResultPtr);`
			`ConversionFlags flags = strictConversion;`
			`result = ConvertUTF8toUTF32(`
			`&sourceStart, sourceStart + Source.size(),`
			`&targetStart, targetStart + 4*Source.size(), flags);`
			`if (result == conversionOK)`
			`ResultPtr = reinterpret_cast<char*>(targetStart);`
			`else`
			`ErrorPtr = sourceStart;`
			`}`
			`assert((result != targetExhausted)`
			`&& "ConvertUTF8toUTFXX exhausted target buffer");`
			`return result == conversionOK;`
			`}`

			`bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {`
			`const UTF32 *SourceStart = &Source;`
			`const UTF32 *SourceEnd = SourceStart + 1;`
			`UTF8 TargetStart = reinterpret_cast<UTF8 >(ResultPtr);`
			`UTF8 *TargetEnd = TargetStart + 4;`
			`ConversionResult CR = ConvertUTF32toUTF8(&SourceStart, SourceEnd,`
			`&TargetStart, TargetEnd,`
			`strictConversion);`
			`if (CR != conversionOK)`
			`return false;`

			`ResultPtr = reinterpret_cast<char*>(TargetStart);`
			`return true;`
			`}`

[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`bool hasUTF16ByteOrderMark(ArrayRef<char> S) {`
			`return (S.size() >= 2 &&`
			`((S[0] == '\xff' && S[1] == '\xfe') \|\|`
			`(S[0] == '\xfe' && S[1] == '\xff')));`
			`}`

			`bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {`
			`assert(Out.empty());`

			`// Error out on an uneven byte count.`
			`if (SrcBytes.size() % 2)`
			`return false;`

			`// Avoid OOB by returning early on empty input.`
			`if (SrcBytes.empty())`
			`return true;`

			`const UTF16 Src = reinterpret_cast<const UTF16 >(SrcBytes.begin());`
			`const UTF16 SrcEnd = reinterpret_cast<const UTF16 >(SrcBytes.end());`

			`// Byteswap if necessary.`
			`std::vector<UTF16> ByteSwapped;`
			`if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {`
			`ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);`
			`for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I)`
			`ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]);`
			`Src = &ByteSwapped[0];`
			`SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;`
			`}`

			`// Skip the BOM for conversion.`
			`if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)`
			`Src++;`

Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`// Just allocate enough space up front. We'll shrink it later. Allocate`
			`// enough that we can fit a null terminator without reallocating.`
			`Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT + 1);`
[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`UTF8 Dst = reinterpret_cast<UTF8 >(&Out[0]);`
			`UTF8 *DstEnd = Dst + Out.size();`

			`ConversionResult CR =`
			`ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);`
			`assert(CR != targetExhausted);`

			`if (CR != conversionOK) {`
			`Out.clear();`
			`return false;`
			`}`

			`Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);`
Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`Out.push_back(0);`
			`Out.pop_back();`
[Support] Add a Unicode conversion wrapper from UTF16 to UTF8 This is to support parsing UTF16 response files in LLVM/lib/Option for lld and clang. Reviewers: hans Differential Revision: http://llvm-reviews.chandlerc.com/D1138 llvm-svn: 186426 2013-07-16 19:14:33 +02:00			`return true;`
			`}`

Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`bool convertUTF8ToUTF16String(StringRef SrcUTF8,`
			`SmallVectorImpl<UTF16> &DstUTF16) {`
			`assert(DstUTF16.empty());`

			`// Avoid OOB by returning early on empty input.`
Make UTF8->UTF16 conversion null terminate output on empty input. llvm-svn: 228527 2015-02-08 19:08:51 +01:00			`if (SrcUTF8.empty()) {`
			`DstUTF16.push_back(0);`
			`DstUTF16.pop_back();`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`return true;`
Make UTF8->UTF16 conversion null terminate output on empty input. llvm-svn: 228527 2015-02-08 19:08:51 +01:00			`}`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00
			`const UTF8 Src = reinterpret_cast<const UTF8 >(SrcUTF8.begin());`
			`const UTF8 SrcEnd = reinterpret_cast<const UTF8 >(SrcUTF8.end());`

			`// Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding`
			`// as UTF-16 should always require the same amount or less code units than the`
Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`// UTF-8 encoding. Allocate one extra byte for the null terminator though,`
			`// so that someone calling DstUTF16.data() gets a null terminated string.`
			`// We resize down later so we don't have to worry that this over allocates.`
			`DstUTF16.resize(SrcUTF8.size()+1);`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`UTF16 *Dst = &DstUTF16[0];`
			`UTF16 *DstEnd = Dst + DstUTF16.size();`

			`ConversionResult CR =`
			`ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion);`
			`assert(CR != targetExhausted);`

			`if (CR != conversionOK) {`
			`DstUTF16.clear();`
			`return false;`
			`}`

			`DstUTF16.resize(Dst - &DstUTF16[0]);`
Have the UTF conversion wrappers append a null terminator. This is especially useful for the UTF8 -> UTF16 direction, since there is no equivalent of llvm::SmallString<> for wide characters. This means that anyone who wants a null terminated string is forced to manually push and pop their own null terminator. Reviewed by: Reid Kleckner. llvm-svn: 227143 2015-01-26 23:05:50 +01:00			`DstUTF16.push_back(0);`
			`DstUTF16.pop_back();`
Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper This can also be used instead of the WindowsSupport.h ConvertUTF8ToUTF16 helpers, but that will require massaging some character types. The Windows support routines want wchar_t output, but wchar_t is often 32 bits on non-Windows OSs. llvm-svn: 227122 2015-01-26 20:51:00 +01:00			`return true;`
			`}`

Move UTF conversion routines from clang/lib/Basic to llvm/lib/Support This is required to use them in TableGen. llvm-svn: 173923 2013-01-30 13:05:05 +01:00			`} // end namespace llvm`