C# 如何解码编码为UTF8的代理字符？_C#_C++_Unicode_Utf 8_Surrogate Pairs

C# 如何解码编码为UTF8的代理字符？

c# c++ unicode utf-8

C# 如何解码编码为UTF8的代理字符？,c#,c++,unicode,utf-8,surrogate-pairs,C#,C++,Unicode,Utf 8,Surrogate Pairs,我的C#程序获取一些UTF-8编码的数据，并使用Encoding.UTF8.GetString（data）对其进行解码。当生成数据的程序获取BMP之外的字符时，它将它们编码为2个代理字符，每个字符分别编码为UTF-8。在这种情况下，我的程序无法正确解码它们如何在C#中解码此类数据示例： static void Main（字符串[]args） { String Org=“没有一个好的东西是不可能确定的。但是我觉得你好像在C++中使用了错误的转换器。区域设置从UCS-2而不是UTF-16转换。

我的C#程序获取一些UTF-8编码的数据，并使用

Encoding.UTF8.GetString(data)

对其进行解码。当生成数据的程序获取BMP之外的字符时，它将它们编码为2个代理字符，每个字符分别编码为UTF-8。在这种情况下，我的程序无法正确解码它们

如何在C#中解码此类数据

示例：

static void Main（字符串[]args）
{
String Org=“没有一个好的东西是不可能确定的。但是我觉得你好像在C++中使用了错误的转换器。
区域设置从UCS-2而不是UTF-16转换。两者非常相似，但UCS-2不支持对要编码的字符进行编码所需的代理项对
相反，您应该使用：
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf8Converter;
std::string utf8str = utf8Converter.to_bytes(wstr);

当我使用该转换器时，我得到了所需的UTF-8字节：F0 9F 8C 8E
。当然，当这些字节被解释为UTF-8时，它们在.NET中可以正确解码


附录：
该问题已更新，表明编码端的代码无法更改。您的UCS-2数据已被编码为无效的UTF8。由于该UTF8无效，您必须自己解码文本。
我看到了一些合理的方法来做这件事。首先，编写一个不关心UTF8是否包含无效字节序列的解码器。第二，使用C++的 std::wstring_convert 转换器来解码字节（例如，用C++编写你的接收代码，或者写一个C++的DLL，你可以从C#代码中调用它来完成这项工作）。
第二个选项在某种意义上更可靠，因为你使用的正是最初生成这些坏数据的那个转换器。另一方面，即使只是创建一个DLL也可能是小题大做，更不用说用C++编写整个客户端了。对于DLL，即使使用C++/CLI，要让互操作正确工作仍然有一些令人头痛的地方，除非你已经是专家了。
我对C++/CLI很熟悉，但几乎不是专家。我对C#的使用要好得多，所以这里有一些关于第一个选项的代码：
// Added to (codePoint >> 10) to obtain the high surrogate of a supplementary
// code point: 0xD800 minus the 0x10000 plane bias, pre-shifted by 10 bits.
private const int _khighOffset = 0xD800 - (0x10000 >> 10);

/// <summary>
/// Leniently decodes a nominally UTF8 byte sequence as UTF16. Sequences that
/// are invalid but still decodable (overlong forms, individually encoded
/// surrogate halves, 5- and 6-byte forms) are decoded without error, so the
/// result may contain invalid UTF16 such as unpaired surrogates.
/// </summary>
/// <param name="bytes">The UTF8 byte sequence to decode</param>
/// <returns>A string value representing the decoded UTF8</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="bytes"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The input cannot be interpreted coherently: a continuation byte appears
/// where none is expected, a multi-byte sequence is interrupted or truncated,
/// a 0xFE/0xFF byte is present, or a decoded value exceeds U+10FFFF and
/// therefore has no UTF16 representation.
/// </exception>
/// <remarks>
/// This method has not been thoroughly validated. It should be tested
/// carefully with a broad range of inputs (the entire UTF16 code point
/// range would not be unreasonable) before being used in any sort of
/// production environment.
/// </remarks>
private static string DecodeUtf8WithOverlong(byte[] bytes)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes");
    }

    // A decoded character never needs more UTF16 units than it had UTF8 bytes.
    List<char> result = new List<char>(bytes.Length);
    int pending = 0;    // continuation bytes still expected for the current character
    int codePoint = 0;  // payload bits gathered so far for the current character

    for (int i = 0; i < bytes.Length; i++)
    {
        byte b = bytes[i];

        if (b >= 0x80 && b < 0xC0)
        {
            // Byte values in this range are used only as continuation bytes.
            // If we aren't expecting any continuation bytes, then the input
            // is invalid beyond repair.
            if (pending == 0)
            {
                throw new ArgumentException("invalid encoding");
            }

            // Each continuation byte contributes 6 bits of the character value.
            codePoint = (codePoint << 6) | (b & 0x3F);

            if (--pending == 0)
            {
                if (codePoint > 0x10FFFF)
                {
                    // Beyond the Unicode range: no surrogate pair can express it.
                    throw new ArgumentException("invalid encoding");
                }

                if (codePoint > 0xFFFF)
                {
                    // Code point requires more than 16 bits, so split into surrogate pair
                    result.Add((char)(_khighOffset + (codePoint >> 10)));
                    result.Add((char)(0xDC00 + (codePoint & 0x3FF)));
                }
                else
                {
                    // Includes overlong forms and lone surrogate halves, kept as-is.
                    result.Add((char)codePoint);
                }

                codePoint = 0;
            }

            continue;
        }

        // Every other byte starts a new character. If the previous one is still
        // incomplete its bits would otherwise leak into this one, so reject it.
        if (pending != 0)
        {
            throw new ArgumentException("invalid encoding");
        }

        if (b < 0x80)
        {
            result.Add((char)b);
        }
        else if (b < 0xE0)
        {
            // 110xxxxx: 5 payload bits, 1 continuation byte
            pending = 1;
            codePoint = b & 0x1F;
        }
        else if (b < 0xF0)
        {
            // 1110xxxx: 4 payload bits, 2 continuation bytes. No special case
            // for 0xE0, so overlong three-byte forms decode to their true value.
            pending = 2;
            codePoint = b & 0x0F;
        }
        else if (b < 0xF8)
        {
            // 11110xxx: 3 payload bits, 3 continuation bytes
            pending = 3;
            codePoint = b & 0x07;
        }
        else if (b < 0xFC)
        {
            // 111110xx: obsolete 5-byte form, 2 payload bits
            pending = 4;
            codePoint = b & 0x03;
        }
        else if (b < 0xFE)
        {
            // 1111110x: obsolete 6-byte form, 1 payload bit (31 bits total, fits in int)
            pending = 5;
            codePoint = b & 0x01;
        }
        else
        {
            // byte values of 0xFE and 0xFF are invalid
            throw new ArgumentException("invalid encoding");
        }
    }

    // Input ended in the middle of a multi-byte sequence.
    if (pending != 0)
    {
        throw new ArgumentException("invalid encoding");
    }

    return new string(result.ToArray());
}

private const int\u khighOffset=0xD800-（0x10000>>10）；
/// 
///将名义上的UTF8字节序列解码为UTF16。忽略所有数据错误
///除了那些妨碍对输入数据进行连贯解释的因素。
///带有无效但可解码UTF8序列的输入将被解码，无需
///错误，并可能导致UTF16无效。
/// 
///要解码的UTF8字节序列
///表示已解码UTF8的字符串值
/// 
///此方法尚未得到彻底验证。应进行测试
///小心地使用广泛的输入（整个UTF16代码点
///范围不会不合理），然后才用于任何类型的
///生产环境。
/// 
专用静态字符串DecodeUtf8WithOverlong（字节[]字节）
{
列表结果=新列表（）；
int continuationCount=0，continuationacculator=0，highBase=0；
字符continuationBase='\0'；
for（int i=0；i10），
lowsrogate=（char）（0xDC00+（continuationacculator&0x3FF））；
结果。添加（高级代理）；
结果。添加（低替代项）；
}
其他的
{
结果.添加（（字符）（continuationBase | continuationacculator））；
}
连续累加器=0；
continuationBase='\0'；
高基=0；
}
继续；
}
if（b<0xE0）
{
continuationCount=1；
continuationBase=（char）（（b-0xC0）*0x0040）；
继续；
}
if（b<0xF0）
{
连续计数=2；
continuationBase=（char）（b==0xE0？0x0800:（b-0xE0）*0x1000）；
继续；
}
如果（b<0xF8）
{
连续计数=3；
高基=（b-0xF0）*0x00040000；
继续；
}
如果（b<0xFC）
{
连续计数=4；
高基=（b-0xF8）*0x01000000；
继续；
}
if（b<0xFE）
{
连续计数=5；
高基=（b-0xFC）*0x40000000；
继续；
}
//0xFE和0xFF的字节值无效
抛出新ArgumentException（“无效编码”）；
}
返回新字符串（result.ToArray（））；
}

我用你的globe字符测试了它，效果很好。它还正确地解码了该字符的正确UTF8（即F0 9F 8C 8E
）。如果您打算使用该代码对所有UTF8输入进行解码，您当然会希望使用完整的数据范围对其进行测试。
我得到字符的0xD83C 0xDF0E
，而不是您声称的0xD83D 0xDF0E
。此外，如果我使用.NET将该字符编码为UTF8，我得到F0 9F 8C 8E，而不是ED A0 ED BC8E
如您所说。最后，当我将F0 9F 8C 8E
解码回C#字符串时，我得到了“代理代码点不能用UTF-8（或任何UTF）编码，因此编码.UTF8.GetString正确地替换了无效的byt
// Added to (codePoint >> 10) to obtain the high surrogate of a supplementary
// code point: 0xD800 minus the 0x10000 plane bias, pre-shifted by 10 bits.
private const int _khighOffset = 0xD800 - (0x10000 >> 10);

/// <summary>
/// Leniently decodes a nominally UTF8 byte sequence as UTF16. Sequences that
/// are invalid but still decodable (overlong forms, individually encoded
/// surrogate halves, 5- and 6-byte forms) are decoded without error, so the
/// result may contain invalid UTF16 such as unpaired surrogates.
/// </summary>
/// <param name="bytes">The UTF8 byte sequence to decode</param>
/// <returns>A string value representing the decoded UTF8</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="bytes"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// The input cannot be interpreted coherently: a continuation byte appears
/// where none is expected, a multi-byte sequence is interrupted or truncated,
/// a 0xFE/0xFF byte is present, or a decoded value exceeds U+10FFFF and
/// therefore has no UTF16 representation.
/// </exception>
/// <remarks>
/// This method has not been thoroughly validated. It should be tested
/// carefully with a broad range of inputs (the entire UTF16 code point
/// range would not be unreasonable) before being used in any sort of
/// production environment.
/// </remarks>
private static string DecodeUtf8WithOverlong(byte[] bytes)
{
    if (bytes == null)
    {
        throw new ArgumentNullException("bytes");
    }

    // A decoded character never needs more UTF16 units than it had UTF8 bytes.
    List<char> result = new List<char>(bytes.Length);
    int pending = 0;    // continuation bytes still expected for the current character
    int codePoint = 0;  // payload bits gathered so far for the current character

    for (int i = 0; i < bytes.Length; i++)
    {
        byte b = bytes[i];

        if (b >= 0x80 && b < 0xC0)
        {
            // Byte values in this range are used only as continuation bytes.
            // If we aren't expecting any continuation bytes, then the input
            // is invalid beyond repair.
            if (pending == 0)
            {
                throw new ArgumentException("invalid encoding");
            }

            // Each continuation byte contributes 6 bits of the character value.
            codePoint = (codePoint << 6) | (b & 0x3F);

            if (--pending == 0)
            {
                if (codePoint > 0x10FFFF)
                {
                    // Beyond the Unicode range: no surrogate pair can express it.
                    throw new ArgumentException("invalid encoding");
                }

                if (codePoint > 0xFFFF)
                {
                    // Code point requires more than 16 bits, so split into surrogate pair
                    result.Add((char)(_khighOffset + (codePoint >> 10)));
                    result.Add((char)(0xDC00 + (codePoint & 0x3FF)));
                }
                else
                {
                    // Includes overlong forms and lone surrogate halves, kept as-is.
                    result.Add((char)codePoint);
                }

                codePoint = 0;
            }

            continue;
        }

        // Every other byte starts a new character. If the previous one is still
        // incomplete its bits would otherwise leak into this one, so reject it.
        if (pending != 0)
        {
            throw new ArgumentException("invalid encoding");
        }

        if (b < 0x80)
        {
            result.Add((char)b);
        }
        else if (b < 0xE0)
        {
            // 110xxxxx: 5 payload bits, 1 continuation byte
            pending = 1;
            codePoint = b & 0x1F;
        }
        else if (b < 0xF0)
        {
            // 1110xxxx: 4 payload bits, 2 continuation bytes. No special case
            // for 0xE0, so overlong three-byte forms decode to their true value.
            pending = 2;
            codePoint = b & 0x0F;
        }
        else if (b < 0xF8)
        {
            // 11110xxx: 3 payload bits, 3 continuation bytes
            pending = 3;
            codePoint = b & 0x07;
        }
        else if (b < 0xFC)
        {
            // 111110xx: obsolete 5-byte form, 2 payload bits
            pending = 4;
            codePoint = b & 0x03;
        }
        else if (b < 0xFE)
        {
            // 1111110x: obsolete 6-byte form, 1 payload bit (31 bits total, fits in int)
            pending = 5;
            codePoint = b & 0x01;
        }
        else
        {
            // byte values of 0xFE and 0xFF are invalid
            throw new ArgumentException("invalid encoding");
        }
    }

    // Input ended in the middle of a multi-byte sequence.
    if (pending != 0)
    {
        throw new ArgumentException("invalid encoding");
    }

    return new string(result.ToArray());
}