Warning: file_get_contents(/data/phpspider/zhask/data//catemap/6/cplusplus/128.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181

Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/selenium/4.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
在Linux中将UTF-32宽字符转换为UTF-16宽字符作为补充平面字符 我们使用ICU,./P>在RHEL上部署了C++应用程序。_C++_Linux_Icu_Wchar T - Fatal编程技术网

在Linux中将UTF-32宽字符转换为UTF-16宽字符作为补充平面字符 我们使用ICU,./P>在RHEL上部署了C++应用程序。

在Linux中将UTF-32宽字符转换为UTF-16宽字符作为补充平面字符 我们使用ICU,./P>在RHEL上部署了C++应用程序。,c++,linux,icu,wchar-t,C++,Linux,Icu,Wchar T,在linux上,我们需要将UChar*转换为wchar\u t*。我们使用u_strotwcs来执行转换 #include <iostream> #include <wchar.h> #include "unicode/ustring.h" void convertUnicodeStringtoWideChar(const UChar* cuniszSource, const int32_t cu

在 Linux 上，我们需要将 UChar* 转换为 wchar_t*。我们使用 u_strToWCS 来执行转换。

#include <iostream>
#include <wchar.h>

#include "unicode/ustring.h"

/**
 * Converts an ICU UChar string into a freshly allocated, NUL-terminated
 * wchar_t string using u_strToWCS.
 *
 * @param cuniszSource        Source string, passed through to u_strToWCS.
 * @param cunii32SourceLength Length of the source, passed through to
 *                            u_strToWCS.
 * @param rpwcharDestination  Receives a buffer allocated with new[] that the
 *                            caller must release with delete[]. Set to null
 *                            when the conversion fails.
 * @param destCapacity        Receives wcslen() of the converted string, or 0
 *                            when the conversion fails. (Despite its name this
 *                            is a length, exactly as in the original code.)
 *
 * Allocation failure is reported by new[] throwing std::bad_alloc.
 */
void convertUnicodeStringtoWideChar(const UChar* cuniszSource,
                                    const int32_t cunii32SourceLength,
                                    wchar_t*& rpwcharDestination,
                                    int32_t& destCapacity)
{
  rpwcharDestination = nullptr;
  destCapacity = 0;

  // Pre-flight call: with no buffer and zero capacity, u_strToWCS is used only
  // to obtain the required length in pDestLength. Its status is expected to be
  // a buffer-overflow error and is therefore discarded; a genuine failure
  // (e.g. bad arguments) repeats on the real call below and is caught there.
  UErrorCode uniUErrorCode = U_ZERO_ERROR;
  int32_t pDestLength = 0;
  u_strToWCS(nullptr,
             0,
             &pDestLength,
             cuniszSource,
             cunii32SourceLength,
             &uniUErrorCode);

  // Value-initialise so the buffer is always NUL-terminated, even if the
  // conversion below writes nothing.
  const int32_t bufferCapacity = pDestLength + 1;
  wchar_t* pwcharBuffer = new wchar_t[bufferCapacity]();

  uniUErrorCode = U_ZERO_ERROR;
  u_strToWCS(pwcharBuffer,
             bufferCapacity,
             &pDestLength,
             cuniszSource,
             cunii32SourceLength,
             &uniUErrorCode);

  // ICU error codes greater than U_ZERO_ERROR are failures (this is the test
  // U_FAILURE() performs); warnings are negative and are accepted.
  if(uniUErrorCode > U_ZERO_ERROR)
  {
    delete[] pwcharBuffer;
    return;
  }

  rpwcharDestination = pwcharBuffer;
  destCapacity = static_cast<int32_t>(wcslen(pwcharBuffer));
} //function ends

int main()
{
    //                     a       ä       Š       €    (     In C++11 and later, this conversion is in the standard library, in the 
<codecvt>
header. Here is some sample code that converts between UTF-16, UCS-4 and wchar_t. (It breaks on libstdc++ 6.4.9 due to a bug that has since been fixed in the development tree.)

#include <codecvt>
#include <cstdlib>
#include <cstring>
#include <cwctype>
#include <iostream>
#include <locale>
#include <vector>

using std::cout;
using std::endl;
using std::exit;
using std::memcmp;
using std::size_t;

using std::wcout;

/**
 * Demonstrates std::codecvt_utf16: converts the same greeting from wchar_t and
 * from char32_t into little-endian UTF-16 bytes, compares each result against
 * a UTF-16 literal, and reports the outcome on std::wcout.
 *
 * @return EXIT_SUCCESS in all cases; mismatches are only reported as text.
 */
int main(void)
{
  constexpr char16_t msg_utf16[] = u"¡Hola, mundo! \U0001F600"; // Shouldn't assume endianness.
  constexpr wchar_t msg_w[] = L"¡Hola, mundo! \U0001F600";
  constexpr char32_t msg_utf32[] = U"¡Hola, mundo! \U0001F600";
  constexpr char msg_utf8[] = u8"¡Hola, mundo! \U0001F600";

  // May vary from OS to OS: "" is the most standard, but might require, e.g. "en_US.utf8".
  constexpr char locale_name[] = "";
  std::locale::global(std::locale(locale_name));
  std::wcout.imbue(std::locale());

  const std::codecvt_utf16<wchar_t, 0x1FFFF, std::little_endian> converter_w;
  const std::size_t max_len = sizeof(msg_utf16);
  std::vector<char> out(max_len);
  // The conversion state must start out zero-initialised (the "initial
  // conversion state"); an indeterminate mbstate_t must not be handed to
  // codecvt::out().
  std::mbstate_t state{};
  const wchar_t* from_w = nullptr;
  char* to_next = nullptr;

  // The source range deliberately includes the terminating NUL of msg_w.
  converter_w.out( state, msg_w, msg_w+sizeof(msg_w)/sizeof(wchar_t), from_w, out.data(), out.data() + out.size(), to_next );

  if ( std::memcmp( msg_utf8, out.data(), sizeof(msg_utf8) ) == 0 ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> converts to UTF-8, not UTF-16!" << std::endl;
  } else if ( std::memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion not equal!" << std::endl;
  } else {
    std::wcout << L"std::codecvt_utf16<wchar_t> conversion is correct." << std::endl;
  }

  // Start the second, unrelated conversion from a zeroed buffer and a fresh
  // initial conversion state.
  out.assign(max_len, '\0');
  state = std::mbstate_t{};

  const std::codecvt_utf16<char32_t, 0x1FFFF, std::little_endian> converter_u32;
  const char32_t* from_u32 = nullptr;
  converter_u32.out( state, msg_utf32, msg_utf32+sizeof(msg_utf32)/sizeof(char32_t), from_u32, out.data(), out.data() + out.size(), to_next );

  if ( std::memcmp( msg_utf16, out.data(), max_len ) != 0 ) {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion not equal!" << std::endl;
  } else {
    std::wcout << L"std::codecvt_utf16<char32_t> conversion is correct." << std::endl;
  }

  std::wcout << msg_w << std::endl;
  return EXIT_SUCCESS;
}
#包括
#包括
#包括“unicode/usting.h”
void convertUnicode DestingToWideChar(常量UChar*cuniszSource,
常量int32\u t cunii32SourceLength,
wchar_t*&RPWChard估计,
int32_t&dest容量)
{
UErrorCode uniUErrorCode=U_零_错误;
int32_t pDestLength=0;
rpwcharDestination=0;
容量=0;
u_STROTOWCS(RPWChardDestination,
容量,
&pDestLength,
cuniszSource,
cunii32SourceLength,
&统一代码);
uniUErrorCode=U_零_错误;
rpwcharDestination=新的wchar_t[pDestLength+1];
if(rpwcharDestination)
{
destCapacity=pDestLength+1;
u_STROTOWCS(RPWChardDestination,
容量,
&pDestLength,
cuniszSource,
cunii32SourceLength,
&统一代码);
destCapacity=wcslen(rpwcharDestination);
}
}//函数结束
int main()
{

//a䊀(在C++11及更高版本中,此转换位于标准库中的
<codecvt> 头文件中。以下是一些在 UTF-16、UCS-4 和 wchar_t 之间转换的示例代码。（由于一个已在开发树中修复的错误，它在 libstdc++ 6.4.9 上无法正常工作。）


以下是将UTF-32编码的宽字符转换为UTF-16的代码

//将Unicode字符串从特定于平台的“宽字符”(wchar\u t)转换为UTF-16的函数。
void convertutf32toff16(wchar\u t*源、,
const uint32_t sourceLength,
wchar_t*&目的地,
uint32(运输和目的地长度)
{
wchar_t wcharCharacter;
uint32\u t UNIUI32计数器=0;
wchar_t*pwszDestinationStart=目的地;
wchar_t*sourceStart=source;
如果(0!=目的地)
{
while(uniui32计数器<源长度)
{
wcharCharacter=*source++;
如果(wcharCharacter=0xD800&&
wcharCharacter 0x0010FFFF)
{
/*U+10FFFF是Unicode字符集的最大代码点*/
*目的地++=0x0000FFFD;
目标长度+=1;
}
其他的
{
/*源是0xFFFF-0x10FFFF范围内的字符*/
wcharCharacter-=0x0010000UL;
*目的地+=(wchar_t)((wcharCharacter>>10)+0xD800);
*目的地+=(wchar_t)((wcharCharacter&0x3FFUL)+0xDC00);
目标长度+=2;
}
++UNIUI32计数器;
}
目的地=pwszDestinationStart;
目的地[destinationLength]='\0';
}
source=sourceStart;
}//函数结束

“它失败了…”——输入、观察到的输出、预期的输出?A
main()
提供这些信息,并将此函数(加上必要的
#include
语句)包装到可编译示例中?是否将字符编码为代理项对,例如使用
u“\U0001F600”
或使用
u8的多字节字符串“\U0001F600”
,工作?我冒昧地拍了一些粗糙的
main()
,使其成为MCVE。输出是
61
e4
160
20ac
2f929
,这是我所期望的。(注意最后一个单元,它是输入中的非BMP/UTF-16代理项对。)向OP重复这个问题,inhowfar会“失败”吗"?ICU常见问题解答声称,该库支持UTF-16,而不是UCS-2,因此代理项对应该可以工作。这是否回答了如何在BMP之外编码字符的问题?好的。我看错了方向。u_strToWCS工作正常。出现问题的原因是,我需要使用CORBA将宽字符串传递给windows上的java应用程序。由于linux中的wchar__t是32位的,我需要找到一种方法将32位wchar__t转换为16位wchar_t1)应该是注释,而不是答案(因为OP指定的ICU--C++11可能不可用,或者ICU被设置为需求,我们不知道)2)在C++ 11. 3中出现了“代码转换> <代码>转换,在代码> >代码>中由C++ 17来修改;-)纠正了当添加了方面时的错误。谢谢。如果Linux C++构建环境支持LIbc++或任何最近版本的LIbSTDC++ +,它支持这些方面。无论如何,这是我在HA上遇到的示例代码。nd,我现在用ICU写一个演示已经太晚了。我明天可以回来。@DevSolar:OT,但你知道为什么它们被弃用了吗?@DevSolar更新了我的答案。@Davidslor如果你能分享你写的手动wchar\u t*到UTF-16的转换,那将很有帮助
#include <cassert>
#include <cwctype>
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <locale>
#include <string>

#if _WIN32 || _WIN64
// Windows needs a little non-standard magic for this to work.
#include <io.h>
#include <fcntl.h>
#include <locale.h>
#endif

using std::size_t;

/* Prepares the process so that wide-character output through std::wcout can
 * work.  On Windows this sets the C locale to ".1200" and switches stdout to
 * _O_U16TEXT mode; elsewhere it installs the locale named "" as the global
 * C++ locale and imbues std::wcout with it.
 */
void init_locale(void)
{
#if _WIN32 || _WIN64
  // Non-standard, Windows-only setup.
  setlocale( LC_ALL, ".1200" );
  _setmode( _fileno(stdout), _O_U16TEXT );
#else
  // "" selects the locale from the environment.  Some systems may need an
  // explicit name instead, e.g., "en_US.utf8".
  const std::locale environment_locale("");
  std::locale::global(environment_locale);
  std::wcout.imbue(std::locale());
#endif
}

/* Builds a UTF-16 string from a wide-character string.
 *
 * When wchar_t has the same size as char16_t, the wide string is assumed to
 * already be UTF-16 (or to convert to it implicitly) and is copied element by
 * element with no further checks.
 *
 * Otherwise every wide character is treated as a code point:
 *  - values above U+10FFFF, negative values, and surrogates (U+D800..U+DFFF)
 *    become the replacement character U+FFFD, per the Unicode documentation
 *    (http://www.unicode.org/faq/private_use.html, retrieved 12 March 2017);
 *  - values below U+10000 are copied as a single code unit (noncharacters are
 *    left intact);
 *  - everything else is written as a leading/trailing surrogate pair.
 *
 * The returned string is shrunk to fit, which might not be ideal if the
 * caller intends to append more to it.
 */
std::u16string make_u16string( const std::wstring& ws )
{
  if ( sizeof(wchar_t) == sizeof(char16_t) ) {
    // Trivial path: same width, so rely on the implicit conversion.
    return std::u16string( ws.begin(), ws.end() );
  }

  constexpr char16_t replacement = u'\uFFFD';

  std::u16string utf16;
  // Worst case is one surrogate pair per input character (plus one spare,
  // matching the original reservation).
  utf16.reserve( 2 * ws.length() + 1 );

  for ( const wchar_t unit : ws ) {
    const std::wint_t code_point = unit;
    const bool is_surrogate = ( code_point >= 0xD800 && code_point <= 0xDFFF );

    if ( code_point < 0 || code_point > 0x10FFFF || is_surrogate ) {
      // Not a valid code point: substitute the replacement character.
      utf16.push_back(replacement);
      continue;
    }

    if ( code_point < 0x10000UL ) {
      // Basic Multilingual Plane: one code unit.
      utf16.push_back(static_cast<char16_t>(unit));
      continue;
    }

    // Supplementary plane: split the 20-bit offset into two 10-bit halves.
    const auto offset = code_point - 0x10000UL;
    utf16.push_back(static_cast<char16_t>( (offset / 0x400U) + 0xD800U ));
    utf16.push_back(static_cast<char16_t>( (offset % 0x400U) + 0xDC00U ));
  }

  utf16.shrink_to_fit();
  return utf16;
}

/* Self-test for make_u16string(): converts a wide string containing BMP
 * symbols and one supplementary-plane character, compares the result unit by
 * unit with the equivalent UTF-16 literal, and prints either the first
 * mismatch (exit status EXIT_FAILURE) or the wide string itself
 * (EXIT_SUCCESS).
 */
int main(void)
{
  static const std::wstring wtest(L"☪☮∈✡℩☯✝ \U0001F644");
  static const std::u16string u16test(u"☪☮∈✡℩☯✝ \U0001F644");
  const std::u16string converted = make_u16string(wtest);

  init_locale();

  std::wcout << L"sizeof(wchar_t) == " << sizeof(wchar_t) << L".\n";

  // Scan up to and including the terminator of the expected string, so that a
  // result of a different length also shows up as a mismatch.
  const std::size_t last = u16test.length();
  std::size_t pos = 0;
  while ( pos <= last && u16test[pos] == converted[pos] ) {
    ++pos;
  }

  if ( pos <= last ) {
    const unsigned got = static_cast<unsigned>(converted[pos]);
    const unsigned expected = static_cast<unsigned>(u16test[pos]);

    std::wcout << std::hex << std::showbase
               << std::right << std::setfill(L'0')
               << std::setw(4) << got << L" ≠ "
               << std::setw(4) << expected << L" at "
               << pos << L'.' << std::endl;
    return EXIT_FAILURE;
  }

  std::wcout << wtest << std::endl;

  return EXIT_SUCCESS;
}
/**
 * Converts a Unicode string from platform-specific "wide characters"
 * (wchar_t holding one code point each) to UTF-16, storing every UTF-16 code
 * unit in its own wchar_t of the destination buffer.
 *
 * Values that are not valid code points -- surrogates (U+D800..U+DFFF),
 * anything above U+10FFFF, and negative values on platforms where wchar_t is
 * signed -- are replaced by U+FFFD. Code points above U+FFFF are written as a
 * surrogate pair.
 *
 * @param source            Input characters; sourceLength elements are read.
 * @param sourceLength      Number of input characters to convert.
 * @param destination       Caller-provided output buffer. No capacity is
 *                          passed in, so it must hold at least
 *                          2 * sourceLength + 1 elements. The pointer itself
 *                          is left unchanged. If null, the function does
 *                          nothing.
 * @param destinationLength Receives the number of code units written, not
 *                          counting the terminating NUL. Left untouched when
 *                          destination is null.
 */
void ConvertUTF32ToUTF16(wchar_t* source,
                         const uint32_t sourceLength,
                         wchar_t*& destination,
                         uint32_t& destinationLength)
{
  if(nullptr == destination)
  {
    return;
  }

  wchar_t* pwszOut = destination;

  for(uint32_t uniui32Counter = 0; uniui32Counter < sourceLength; ++uniui32Counter)
  {
    // Work on an unsigned value so that a negative wchar_t (wchar_t is a
    // signed 32-bit type on Linux) lands above U+10FFFF and is rejected
    // instead of being mistaken for a BMP character.
    uint32_t codePoint = static_cast<uint32_t>(source[uniui32Counter]);

    if(codePoint > 0x0010FFFF ||
       (codePoint >= 0xD800 && codePoint <= 0xDFFF))
    {
      /* Beyond the largest Unicode code point, or a UTF-16 surrogate value,
         which is illegal in UTF-32: emit the replacement character. */
      *pwszOut++ = static_cast<wchar_t>(0x0000FFFD);
    }
    else if(codePoint <= 0x0000FFFF)
    {
      /* source is a BMP character */
      *pwszOut++ = static_cast<wchar_t>(codePoint);
    }
    else
    {
      /* source is a character in range 0x10000 - 0x10FFFF */
      codePoint -= 0x00010000;
      *pwszOut++ = static_cast<wchar_t>((codePoint >> 10) + 0xD800);
      *pwszOut++ = static_cast<wchar_t>((codePoint & 0x3FF) + 0xDC00);
    }
  }

  *pwszOut = L'\0';

  // Derive the length from what was actually written rather than adding to
  // whatever value the caller passed in.
  destinationLength = static_cast<uint32_t>(pwszOut - destination);
} //function ends