Unicode 适用于UTF8到1252的Windows C API_Unicode_Utf 8_Cp1252_Windows 1252

Unicode 适用于UTF8到1252的Windows C API

unicode utf-8

Unicode 适用于UTF8到1252的Windows C API,unicode,utf-8,cp1252,windows-1252,Unicode,Utf 8,Cp1252,Windows 1252,我熟悉WideChartToMultiByte和MultiByteToWideChar转换，可以使用这些转换执行以下操作： UTF8->UTF16->1252 我知道iconv会做我需要的事情，但是有人知道任何MS LIB可以在一次呼叫中实现这一点吗我可能应该把iconv库放进去，但我觉得很懒多亏了Windows 1252在很大程度上等同于拉丁语-1，也称为ISO-8859-1:Windows-1252只是在拉丁语-1保留范围128-159中分配了一些额外的字符。如果您准备忽略这些额外的字符

我熟悉WideCharToMultiByte和MultiByteToWideChar这两个转换函数，可以用它们执行以下转换：

UTF8->UTF16->1252

我知道iconv可以满足我的需求，但是有人知道有哪个微软的库能在一次调用中完成这个转换吗？

我可能应该把iconv库放进去，但我觉得很懒

Windows-1252在很大程度上等同于Latin-1（也称为ISO-8859-1）：Windows-1252只是在Latin-1保留的128–159范围内额外分配了一些字符。如果您愿意忽略这些额外字符，只使用Latin-1，那么转换就相当容易了。试试这个：

#include <stddef.h>

/*
 * Convert from UTF-8 to latin-1 (ISO-8859-1).
 *
 * src / src_len:      input bytes and their count (no terminator needed).
 * dst / dst_max_len:  output array and its capacity; at most dst_max_len
 *                     bytes are stored, and no terminator is appended.
 *
 * Each input sequence produces exactly one output byte:
 *  - ASCII bytes are copied unchanged;
 *  - a well-formed two-byte sequence encoding U+0080..U+00FF yields
 *    that latin-1 byte;
 *  - anything else yields '?': code points beyond 255, overlong forms
 *    (lead bytes 0xC0/0xC1), stray continuation bytes, truncated
 *    sequences and the invalid lead bytes 0xF8..0xFF.
 *
 * After a multi-byte lead, only continuation bytes that are really
 * present (at most as many as the lead announces) are consumed, so a
 * malformed sequence never swallows the bytes that follow it.
 *
 * Returned value is the length that the latin-1 string would have had,
 * assuming a big enough destination buffer.
 */
size_t
utf8_to_latin1(char *src, size_t src_len,
    char *dst, size_t dst_max_len)
{
    const unsigned char *sb = (const unsigned char *)src;
    size_t u = 0, v = 0;

    while (u < src_len) {
        int c = sb[u ++];

        if (c >= 0x80) {
            int expected;   /* continuation bytes announced by the lead */
            int seen = 0;   /* continuation bytes actually consumed */
            int cp;         /* code point accumulated so far */

            if (c >= 0xC0 && c < 0xE0) {
                expected = 1;
                cp = c & 0x1F;
            } else if (c >= 0xE0 && c < 0xF0) {
                expected = 2;
                cp = c & 0x0F;
            } else if (c >= 0xF0 && c < 0xF8) {
                expected = 3;
                cp = c & 0x07;
            } else {
                /* Stray continuation byte or invalid lead (0xF8..0xFF):
                 * consume this single byte only. */
                expected = 0;
                cp = 0;
            }

            while (seen < expected && u < src_len
                && (sb[u] & 0xC0) == 0x80) {
                cp = (cp << 6) | (sb[u] & 0x3F);
                u ++;
                seen ++;
            }

            /* Only a complete, non-overlong two-byte sequence can encode
             * a latin-1 character above 0x7F. */
            if (expected == 1 && seen == 1 && cp >= 0x80 && cp <= 0xFF)
                c = cp;
            else
                c = '?';
        }
        if (v < dst_max_len)
            dst[v] = (char)c;
        v ++;
    }
    return v;
}

/*
 * Encode a latin-1 (ISO-8859-1) byte string as UTF-8.
 *
 * Bytes below 0x80 are copied as-is; every other byte expands to a
 * two-byte sequence. Output is written only while it fits within
 * dst_max_len bytes, so the last sequence may be cut after its first
 * byte. No terminator is appended.
 *
 * Returns the number of bytes the complete UTF-8 string occupies,
 * whether or not all of them fitted in dst.
 */
size_t
latin1_to_utf8(char *src, size_t src_len,
    char *dst, size_t dst_max_len)
{
    const unsigned char *in = (const unsigned char *)src;
    size_t out_len = 0;
    size_t pos;

    for (pos = 0; pos < src_len; pos ++) {
        int ch = in[pos];

        if (ch >= 0x80) {
            /* Two-byte form: 110000xx 10xxxxxx. */
            if (out_len < dst_max_len) {
                dst[out_len] = (char)(0xC0 | (ch >> 6));
                if ((out_len + 1) < dst_max_len)
                    dst[out_len + 1] = (char)(0x80 | (ch & 0x3F));
            }
            out_len += 2;
            continue;
        }

        /* Plain ASCII: one byte in, one byte out. */
        if (out_len < dst_max_len)
            dst[out_len] = (char)ch;
        out_len += 1;
    }
    return out_len;
}

#包括
/*  
*将UTF-8转换为拉丁语-1。无效的编码，以及
*超过255的代码点将替换为问号。不过是
*dst_max_len字节存储在目标阵列中。返回值
*是拉丁-1字符串的长度，假设一个大的
*足够的目标缓冲区。
*/
尺寸
utf8到拉丁语（字符*src，大小，
字符*dst，大小（最大长度）
{   
未签名字符*sb；
尺寸_tu，v；
u=v=0；
sb=（无符号字符*）src；
而（u=0x80）{
如果（c>=0xC0&&c<0xE0）{
如果（u==src_len）{
c='？'；
}否则{
int w=sb[u]；
如果（w>=0x80&&w<0xC0）{
u++；
c=（（c&0x1F）=0；i--）
如果（！（c&（1>6）；
int l=0x80+（c&0x3F）；
如果（v


请注意，我对这段代码不作任何保证。这是完全未经测试的。
Windows-1252基本上等同于Latin-1（也称为ISO-8859-1）：Windows-1252只是在Latin-1保留的128–159范围内额外分配了一些字符。如果您愿意忽略这些额外字符，只使用Latin-1，则转换相当简单。请尝试以下方法：
#include <stddef.h>

/*
 * Convert from UTF-8 to latin-1 (ISO-8859-1).
 *
 * src / src_len:      input bytes and their count (no terminator needed).
 * dst / dst_max_len:  output array and its capacity; at most dst_max_len
 *                     bytes are stored, and no terminator is appended.
 *
 * Each input sequence produces exactly one output byte:
 *  - ASCII bytes are copied unchanged;
 *  - a well-formed two-byte sequence encoding U+0080..U+00FF yields
 *    that latin-1 byte;
 *  - anything else yields '?': code points beyond 255, overlong forms
 *    (lead bytes 0xC0/0xC1), stray continuation bytes, truncated
 *    sequences and the invalid lead bytes 0xF8..0xFF.
 *
 * After a multi-byte lead, only continuation bytes that are really
 * present (at most as many as the lead announces) are consumed, so a
 * malformed sequence never swallows the bytes that follow it.
 *
 * Returned value is the length that the latin-1 string would have had,
 * assuming a big enough destination buffer.
 */
size_t
utf8_to_latin1(char *src, size_t src_len,
    char *dst, size_t dst_max_len)
{
    const unsigned char *sb = (const unsigned char *)src;
    size_t u = 0, v = 0;

    while (u < src_len) {
        int c = sb[u ++];

        if (c >= 0x80) {
            int expected;   /* continuation bytes announced by the lead */
            int seen = 0;   /* continuation bytes actually consumed */
            int cp;         /* code point accumulated so far */

            if (c >= 0xC0 && c < 0xE0) {
                expected = 1;
                cp = c & 0x1F;
            } else if (c >= 0xE0 && c < 0xF0) {
                expected = 2;
                cp = c & 0x0F;
            } else if (c >= 0xF0 && c < 0xF8) {
                expected = 3;
                cp = c & 0x07;
            } else {
                /* Stray continuation byte or invalid lead (0xF8..0xFF):
                 * consume this single byte only. */
                expected = 0;
                cp = 0;
            }

            while (seen < expected && u < src_len
                && (sb[u] & 0xC0) == 0x80) {
                cp = (cp << 6) | (sb[u] & 0x3F);
                u ++;
                seen ++;
            }

            /* Only a complete, non-overlong two-byte sequence can encode
             * a latin-1 character above 0x7F. */
            if (expected == 1 && seen == 1 && cp >= 0x80 && cp <= 0xFF)
                c = cp;
            else
                c = '?';
        }
        if (v < dst_max_len)
            dst[v] = (char)c;
        v ++;
    }
    return v;
}

/*
 * Encode a latin-1 (ISO-8859-1) byte string as UTF-8.
 *
 * Bytes below 0x80 are copied as-is; every other byte expands to a
 * two-byte sequence. Output is written only while it fits within
 * dst_max_len bytes, so the last sequence may be cut after its first
 * byte. No terminator is appended.
 *
 * Returns the number of bytes the complete UTF-8 string occupies,
 * whether or not all of them fitted in dst.
 */
size_t
latin1_to_utf8(char *src, size_t src_len,
    char *dst, size_t dst_max_len)
{
    const unsigned char *in = (const unsigned char *)src;
    size_t out_len = 0;
    size_t pos;

    for (pos = 0; pos < src_len; pos ++) {
        int ch = in[pos];

        if (ch >= 0x80) {
            /* Two-byte form: 110000xx 10xxxxxx. */
            if (out_len < dst_max_len) {
                dst[out_len] = (char)(0xC0 | (ch >> 6));
                if ((out_len + 1) < dst_max_len)
                    dst[out_len + 1] = (char)(0x80 | (ch & 0x3F));
            }
            out_len += 2;
            continue;
        }

        /* Plain ASCII: one byte in, one byte out. */
        if (out_len < dst_max_len)
            dst[out_len] = (char)ch;
        out_len += 1;
    }
    return out_len;
}

#包括
/*  
*从UTF-8转换为拉丁语-1。编码无效，并且
*超过255的代码点将替换为问号。不超过
*dst_max_len字节存储在目标数组中。返回值
*是拉丁-1字符串的长度，假设一个大的
*足够的目标缓冲区。
*/
尺寸
utf8到拉丁语（字符*src，大小，
字符*dst，大小（最大长度）
{   
未签名字符*sb；
尺寸_tu，v；
u=v=0；
sb=（无符号字符*）src；
而（u=0x80）{
如果（c>=0xC0&&c<0xE0）{
如果（u==src_len）{
c='？'；
}否则{
int w=sb[u]；
如果（w>=0x80&&w<0xC0）{
u++；
c=（（c&0x1F）=0；i--）
如果（！（c&（1>6）；
int l=0x80+（c&0x3F）；
如果（v

请注意，我对这段代码不作任何保证。这是完全未经测试的。
这比直接使用经过测试的API（先调用MultiByteToWideChar，再调用WideCharToMultiByte）更好吗？
“更好”是一个颇有争议的概念。问题问的是如何用一次函数调用而不是两次来完成转换，特别是当这两个函数是MultiByteToWideChar()和WideCharToMultiByte()时。我的代码做的正是这件事。我并不是说避开MultiByteToWideChar()是明智的。如果要吹毛求疵，可以指出我的函数避免了临时缓冲区的分配。
这比仅仅调用MultiByteToWideChar再调用WideCharToMultiByte、使用经过测试的API更好吗？
“更好”是一个颇有争议的概念。问题问的是如何用一次函数调用而不是两次来完成转换，特别是当这两个函数是MultiByteToWideChar()和WideCharToMultiByte()时。我的代码做的正是这件事。我并不是说避开MultiByteToWideChar()是明智的。有人会吹毛求疵地指出，我的函数避免了临时缓冲区的分配。