C++ 如何轻松检测字符串中的utf8编码？_C++_Windows_String_Encoding_Utf 8

C++ 如何轻松检测字符串中的utf8编码？

c++ windows string encoding utf-8

C++ 如何轻松检测字符串中的utf8编码？,c++,windows,string,encoding,utf-8,C++,Windows,String,Encoding,Utf 8,我有一个字符串，它由来自其他程序的数据填充，这个数据可以是UTF8编码，也可以不是。如果不是，我可以编码UTF8，但是在C++中检测UTF8的最好方法是什么？我看到了这种变体，但有评论说，这种解决方案不能提供100%的检测。所以，若我对已经包含UTF8数据的UTF8字符串进行编码，那个么我会将错误的文本写入数据库所以我可以使用这个UTF8检测： bool is_utf8(const char * string) { if(!string) return 0;

我有一个字符串，它由来自其他程序的数据填充，这些数据可能是UTF8编码，也可能不是。如果不是，我可以把它编码为UTF8，但是在C++中检测UTF8的最好方法是什么？我看到过这种做法，但有评论说，这种解决方案不能提供100%可靠的检测。所以，如果我对已经是UTF8编码的字符串再做一次UTF8编码，那么我就会把错误的文本写入数据库。

所以我可以使用这个UTF8检测：

/*
 * Byte-range UTF-8 check for a NUL-terminated string.
 *
 * Walks the string one sequence at a time. A sequence is accepted when:
 *   - it is a single byte equal to TAB (0x09), LF (0x0A), CR (0x0D) or
 *     in the range 0x20..0x7E (other ASCII control bytes and 0x7F are
 *     rejected by this function), or
 *   - it is a 2-, 3- or 4-byte sequence whose lead byte is in 0xC2..0xF4
 *     and whose following bytes fall inside the ranges tested below, which
 *     exclude overlong forms, the 0xED 0xA0..0xBF surrogate range, and
 *     anything above 0xF4 0x8F.
 *
 * Returns 0 for a null pointer or on the first rejected sequence,
 * 1 once the terminator is reached.
 */
bool is_utf8(const char * string)
{
    const unsigned char * p = (const unsigned char *)string;

    if (!string)
        return 0;

    while (*p)
    {
        const unsigned char lead = p[0];
        /* Allowed range for the second byte; narrowed for special lead bytes. */
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        int length;

        /* Accepted single-byte values: TAB, LF, CR and 0x20..0x7E. */
        if (lead == 0x09 || lead == 0x0A || lead == 0x0D ||
            (lead >= 0x20 && lead <= 0x7E))
        {
            ++p;
            continue;
        }

        if (lead >= 0xC2 && lead <= 0xDF)
        {
            /* 2-byte sequence; 0xC0 and 0xC1 would be overlong. */
            length = 2;
        }
        else if (lead >= 0xE0 && lead <= 0xEF)
        {
            length = 3;
            if (lead == 0xE0)
                second_min = 0xA0;      /* exclude overlong 3-byte forms */
            else if (lead == 0xED)
                second_max = 0x9F;      /* exclude surrogates */
        }
        else if (lead >= 0xF0 && lead <= 0xF4)
        {
            length = 4;
            if (lead == 0xF0)
                second_min = 0x90;      /* exclude overlong 4-byte forms */
            else if (lead == 0xF4)
                second_max = 0x8F;      /* nothing beyond plane 16 */
        }
        else
        {
            return 0;
        }

        /* p[1] is readable because p[0] is not the terminator. */
        if (p[1] < second_min || p[1] > second_max)
            return 0;

        /* Each further byte is only read after the previous one was
         * found to be non-zero, so the scan never passes the terminator. */
        for (int k = 2; k < length; ++k)
        {
            if (p[k] < 0x80 || p[k] > 0xBF)
                return 0;
        }

        p += length;
    }

    return 1;
}

还是说上面的代码不能正确工作？另外，我是在Windows 7中做这件事的。那Ubuntu呢？这种做法在那里也有效吗？

您可能不太了解UTF-8以及其他可选的编码。一个字节只有256个可能的值。相对于字符的数量而言，这并不多。因此，许多字节序列既是有效的UTF-8字符串，也是其他编码中的有效字符串。

事实上，每个ASCII字符串都是一个有效的UTF-8字符串，其含义基本相同。对于 is_utf8("Hello")，您的代码将返回 true。

甚至许多其他非UTF-8、非ASCII的字符串，其字节序列也恰好是有效的UTF-8字符串。如果不知道非UTF-8字符串采用的是哪一种编码，就无法将它转换为UTF-8。就连 Latin-1 和 Latin-2 之间也已经大不相同了。CP_ACP 甚至比 Latin-1 更糟糕：CP_ACP 在不同的机器上甚至都不一样。

您的文本必须以UTF-8的形式进入数据库。因此，如果它还不是UTF-8，就必须对其进行转换，而且您必须知道确切的源编码。这里没有什么神奇的捷径。

在Linux上，

iconv

是在两种编码之间转换的常用方法。

比较整个字节值不是检测UTF-8的正确方法。您必须分析每个字节的实际位模式。UTF-8使用了其他编码所不使用的非常独特的位模式。请尝试类似以下内容：

/*
 * Structural UTF-8 check for a NUL-terminated string.
 *
 * Only the bit patterns are examined: a lead byte of the form 0xxxxxxx,
 * 110xxxxx, 1110xxxx or 11110xxx must be followed by 0, 1, 2 or 3 bytes
 * of the form 10xxxxxx respectively. The decoded value is never looked at,
 * so overlong forms, surrogates and values above U+10FFFF are accepted.
 *
 * Returns true for a null pointer and for a string that matches the
 * pattern all the way to its terminator; false otherwise.
 */
bool is_utf8(const char * string)
{
    const unsigned char * cursor = (const unsigned char *)string;

    if (!string)
        return true;

    while (*cursor != 0x00)
    {
        const unsigned char lead = *cursor++;
        int trailing;

        if (lead < 0x80)
            trailing = 0;               /* 0xxxxxxx: U+0000 to U+007F */
        else if ((lead >> 5) == 0x06)
            trailing = 1;               /* 110xxxxx: U+0080 to U+07FF */
        else if ((lead >> 4) == 0x0E)
            trailing = 2;               /* 1110xxxx: U+0800 to U+FFFF */
        else if ((lead >> 3) == 0x1E)
            trailing = 3;               /* 11110xxx: U+10000 to U+10FFFF */
        else
            return false;

        /* The terminator (0x00) is not of the form 10xxxxxx, so a truncated
         * sequence fails here before the cursor can move past the end. */
        while (trailing-- > 0)
        {
            if ((*cursor >> 6) != 0x02)
                return false;
            ++cursor;
        }
    }

    return true;
}

bool是\u utf8（常量字符*字符串）
{
如果（！字符串）
返回true；
常量无符号字符*字节=（常量无符号字符*）字符串；
int-num；
而（*字节！=0x00）
{
如果（（*字节&0x80）=0x00）
{
//U+0000至U+007F
num=1；
}
else if（（*字节&0xE0）==0xC0）
{
//U+0080至U+07FF
num=2；
}
else if（（*字节&0xF0）==0xE0）
{
//U+0800至U+FFFF
num=3；
}
else if（（*字节&0xF8）==0xF0）
{
//U+10000至U+10FFFF
num=4；
}
其他的
返回false；
字节+=1；
对于（int i=1；i


注意，这并没有考虑非法的UTF-8序列，例如超长编码、UTF-16代理项以及U+10FFFF以上的码点。如果您想确保UTF-8不仅结构有效而且内容正确，则需要如下代码：
/*
 * Strict UTF-8 validation for a NUL-terminated string.
 *
 * Each sequence is decoded into a code point and rejected when:
 *   - the lead byte or a continuation byte has the wrong bit pattern,
 *   - the code point is above U+10FFFF,
 *   - the code point lies in the surrogate range U+D800..U+DFFF, or
 *   - the sequence is longer than the code point requires (overlong).
 *
 * Returns true for a null pointer and for a string in which every
 * sequence passes; false otherwise.
 */
bool is_valid_utf8(const char * string)
{
    /* Smallest code point that needs N bytes, indexed by N (slot 0 unused).
     * A decoded value below this limit means the encoding was overlong. */
    static const unsigned int min_cp[5] = { 0, 0x0000, 0x0080, 0x0800, 0x10000 };

    const unsigned char * p = (const unsigned char *)string;

    if (!string)
        return true;

    while (*p != 0x00)
    {
        const unsigned char lead = *p;
        unsigned int cp;
        int len;

        if ((lead & 0x80) == 0x00)
        {
            /* 0xxxxxxx: U+0000 to U+007F */
            len = 1;
            cp = lead;
        }
        else if ((lead & 0xE0) == 0xC0)
        {
            /* 110xxxxx: U+0080 to U+07FF */
            len = 2;
            cp = lead & 0x1F;
        }
        else if ((lead & 0xF0) == 0xE0)
        {
            /* 1110xxxx: U+0800 to U+FFFF */
            len = 3;
            cp = lead & 0x0F;
        }
        else if ((lead & 0xF8) == 0xF0)
        {
            /* 11110xxx: U+10000 to U+10FFFF */
            len = 4;
            cp = lead & 0x07;
        }
        else
        {
            return false;
        }

        ++p;

        /* Fold in the continuation bytes; the terminator fails the 10xxxxxx
         * test, so a truncated sequence is rejected without over-reading. */
        for (int remaining = len - 1; remaining > 0; --remaining, ++p)
        {
            if ((*p & 0xC0) != 0x80)
                return false;
            cp = (cp << 6) | (*p & 0x3F);
        }

        if (cp > 0x10FFFF)
            return false;               /* beyond the Unicode range */
        if (cp >= 0xD800 && cp <= 0xDFFF)
            return false;               /* UTF-16 surrogate */
        if (cp < min_cp[len])
            return false;               /* overlong encoding */
    }

    return true;
}

bool是有效的\u utf8（常量字符*字符串）
{
如果（！字符串）
返回true；
常量无符号字符*字节=（常量无符号字符*）字符串；
无符号整数cp；
int-num；
而（*字节！=0x00）
{
如果（（*字节&0x80）=0x00）
{
//U+0000至U+007F
cp=（*字节和0x7F）；
num=1；
}
else if（（*字节&0xE0）==0xC0）
{
//U+0080至U+07FF
cp=（*字节和0x1F）；
num=2；
}
else if（（*字节&0xF0）==0xE0）
{
//U+0800至U+FFFF
cp=（*字节和0x0F）；
num=3；
}
else if（（*字节&0xF8）==0xF0）
{
//U+10000至U+10FFFF
cp=（*字节&0x07）；
num=4；
}
其他的
返回false；
字节+=1；
对于（int i=1；i（（cp>=0xD800）和&（cp
（*bytes & 0xE0）== 0xC0 怎么会给出从 0x80 到 0x7FF 的范围……？它给出的应该是从 0xC0 到 0xDF。
@ahmedallam 不，我写的是正确的。请看维基百科上对UTF-8的描述。Unicode码点 U+0080 到 U+07FF（不是字节 0xC0 到 0xDF）使用位模式 110xxxxx 10xxxxxx 以2个字节进行编码。0xE0 的二进制是 11100000，0xC0 的二进制是 11000000。因此，if ((*bytes & 0xE0) == 0xC0) 先检查第一个字节的高3位是否为 110，然后 (*bytes & 0x1F) 取出它的低5位。接着，((*bytes & 0xC0) != 0x80) 先检查第二个字节的高2位是否为 10，然后 (*bytes & 0x3F) 取出它的低6位。
@ahmedallam 看来你需要重温一下位、位掩码和位运算符的工作方式。
@Remylebau 这个函数是异常安全/线程安全的吗？（新手问题）
@NorbertBoros 只要字符串参数指向一个有效的、以空字符结尾的C风格字符串，并且在函数运行期间没有其他线程修改或释放这块内存，那么是的，这个函数是安全的。否则，它的行为是未定义的。
/*
 * Strict UTF-8 validation for a NUL-terminated string.
 *
 * Decodes one sequence at a time and returns false as soon as a sequence
 * has a malformed lead or continuation byte, decodes to a value above
 * U+10FFFF, decodes to a surrogate (U+D800..U+DFFF), or uses a different
 * number of bytes than the shortest encoding of its code point.
 *
 * Returns true for a null pointer and when the terminator is reached
 * without any rejection.
 */
bool is_valid_utf8(const char * string)
{
    const unsigned char * cursor = (const unsigned char *)string;

    if (!string)
        return true;

    while (*cursor != 0x00)
    {
        const unsigned char lead = *cursor++;
        unsigned int cp;
        int len;
        int shortest;

        /* Classify the lead byte by its high nibble. */
        switch (lead >> 4)
        {
        case 0x8: case 0x9: case 0xA: case 0xB:
            /* 10xxxxxx: a continuation byte cannot start a sequence */
            return false;

        case 0xC: case 0xD:
            /* 110xxxxx: U+0080 to U+07FF */
            len = 2;
            cp = lead & 0x1F;
            break;

        case 0xE:
            /* 1110xxxx: U+0800 to U+FFFF */
            len = 3;
            cp = lead & 0x0F;
            break;

        case 0xF:
            /* Only 11110xxx is a lead byte; 11111xxx is rejected. */
            if (lead & 0x08)
                return false;
            /* U+10000 to U+10FFFF */
            len = 4;
            cp = lead & 0x07;
            break;

        default:
            /* 0xxxxxxx: U+0000 to U+007F */
            len = 1;
            cp = lead;
            break;
        }

        /* Append six payload bits per continuation byte. The terminator is
         * not of the form 10xxxxxx, so truncated input stops here. */
        for (int k = 1; k < len; ++k)
        {
            const unsigned char next = *cursor;

            if ((next & 0xC0) != 0x80)
                return false;
            cp = (cp << 6) | (next & 0x3F);
            ++cursor;
        }

        /* Number of bytes the shortest encoding of cp would use. */
        shortest = (cp <= 0x007F) ? 1
                 : (cp <= 0x07FF) ? 2
                 : (cp <= 0xFFFF) ? 3
                 : 4;

        if (cp > 0x10FFFF ||
            (cp >= 0xD800 && cp <= 0xDFFF) ||
            shortest != len)
            return false;
    }

    return true;
}