如何在C中从八进制ISO-8859-1转储utf8_C_Utf 8_Character Encoding

如何在C中从八进制ISO-8859-1转储utf8

c utf-8 character-encoding

如何在C中从八进制ISO-8859-1转储utf8,c,utf-8,character-encoding,C,Utf 8,Character Encoding,我试图在utf8中输出正确的字符，给出以下八进制序列\303\255和\346\234\254，但我没有得到正确的输出 #include <stdio.h> #include <stdlib.h> int encode(char *buf, unsigned char ch){ if(ch < 0x80) { *buf++ = (char)ch; return 1; } if(ch < 0x800) {

我试图在utf8中输出正确的字符，给出以下八进制序列

\303\255

和

\346\234\254

，但我没有得到正确的输出

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Encode one Unicode code point as UTF-8.
 *
 * buf: destination; must have room for up to 4 bytes. No terminator is
 *      written.
 * cp:  the code point to encode.
 *
 * Returns the number of bytes stored (1 to 4), or 0 when cp is not a valid
 * Unicode scalar value (a UTF-16 surrogate, or above U+10FFFF); nothing is
 * written in that case.
 *
 * This takes a wide type on purpose: the range checks below are only
 * meaningful when the argument can actually hold values above 0xFF.
 */
static int encode_code_point(char *buf, unsigned long cp){
    if(cp < 0x80) {
        *buf = (char)cp;
        return 1;
    }
    if(cp < 0x800) {
        *buf++ = (char)((cp >> 6) | 0xC0);
        *buf   = (char)((cp & 0x3F) | 0x80);
        return 2;
    }
    if(cp < 0x10000) {
        /* Surrogates are reserved for UTF-16 and must not be encoded. */
        if(cp >= 0xD800 && cp <= 0xDFFF) {
            return 0;
        }
        *buf++ = (char)((cp >> 12) | 0xE0);
        *buf++ = (char)(((cp >> 6) & 0x3F) | 0x80);
        *buf   = (char)((cp & 0x3F) | 0x80);
        return 3;
    }
    if(cp < 0x110000) {
        *buf++ = (char)((cp >> 18) | 0xF0);
        *buf++ = (char)(((cp >> 12) & 0x3F) | 0x80);
        *buf++ = (char)(((cp >> 6) & 0x3F) | 0x80);
        *buf   = (char)((cp & 0x3F) | 0x80);
        return 4;
    }
    return 0;
}

/*
 * Encode a single byte value (0x00..0xFF, i.e. one ISO-8859-1 character)
 * as UTF-8.
 *
 * buf: destination; must have room for 2 bytes. No terminator is written.
 * ch:  the byte to encode.
 *
 * Returns 1 for ch < 0x80 and 2 otherwise. Because ch is an unsigned char
 * it can never reach the 3- and 4-byte ranges; callers that need to encode
 * larger code points should use encode_code_point() instead.
 */
int encode(char *buf, unsigned char ch){
    return encode_code_point(buf, ch);
}

/*
 * Treat every byte of `str` as one ISO-8859-1 character, convert the whole
 * string to UTF-8 with encode(), and print the result followed by a newline.
 *
 * str: NUL-terminated input string; it is not modified.
 *
 * On allocation failure a message is written to stderr and nothing is
 * printed to stdout.
 */
void output (const char *str) {
    size_t len = strlen(str);
    /*
     * encode() stores at most 2 bytes per input byte, so 2 * len bytes plus
     * a terminator are enough. calloc(len + 1, 2) reserves 2 * len + 2
     * zeroed bytes and lets calloc do the overflow check on the product.
     */
    char *buffer = calloc(len + 1, 2);
    size_t n = 0;

    if (buffer == NULL) {
        fputs("output: out of memory\n", stderr);
        return;
    }
    while(*str) {
        /*
         * Accumulate the offset (the original assigned instead of adding,
         * so later characters overwrote earlier output). The cast keeps
         * bytes >= 0x80 positive where plain char is signed.
         */
        n += (size_t)encode(buffer + n, (unsigned char)*str++);
    }
    /* buffer was zero-filled, so it is already NUL-terminated at n. */
    printf("%s\n", buffer);
    free (buffer);
}

/*
 * Demo driver: feeds two octal-escaped byte sequences to output().
 * The first is the bytes C3 AD, the second the bytes E6 9C AC.
 */
int main() {
    /* NULL-terminated list of the sample inputs, processed in order. */
    char *samples[] = {
        "\303\255",
        "\346\234\254",
        NULL
    };
    char **cursor;

    for (cursor = samples; *cursor != NULL; ++cursor) {
        output(*cursor);
    }

    return 0;
}

#包括
#包括
整数编码（字符*buf，无符号字符ch）{
if（ch<0x80）{
*buf++=（char）ch；
返回1；
}
如果（通道<0x800）{
*buf++=（ch>>6）| 0xC0；
*buf++=（ch&0x3F）| 0x80；
返回2；
}
if（ch<0x10000）{
*buf++=（ch>>12）| 0xE0；
*buf++=（（ch>>6）和0x3F）| 0x80；
*buf++=（ch&0x3F）| 0x80；
返回3；
}
如果（ch<0x110000）{
*buf++=（ch>>18）| 0xF0；
*buf++=（（ch>>12）和0x3F）| 0x80；
*buf++=（（ch>>6）和0x3F）| 0x80；
*buf++=（ch&0x3F）| 0x80；
返回4；
}
返回0；
}
无效输出（字符*str）{
char*buffer=calloc（8，sizeof（char））；
int n=0；
while（*str）{
n=编码（缓冲区+n，*str++）；
}   
printf（“%s\n”，缓冲区）；
自由（缓冲）；
}
int main（）{
char*str1=“\303\255”；
char*str2=“\346\234\254”；
输出（str1）；
输出（str2）；
返回0；
}

输出：

本
函数参数类型不合适：`unsigned char ch`（它只能表示 0…0xFF，无法容纳更大的码点）

/// In the following bad code, `if(ch < 0x10000)` is never true
int encode(char *buf, unsigned char ch){
    if(ch < 0x80) {
      ...
      return 1;
    if(ch < 0x800) {
      ...
      return 2;
    if(ch < 0x10000) {

/// 在下面的错误代码中，`if(ch < 0x10000)` 永远不会为真
int encode(char *buf, unsigned char ch){
    if(ch < 0x80) {
      ...
      return 1;
    if(ch < 0x800) {
      ...
      return 2;
    if(ch < 0x10000) {

对不起，GTG
注意：错误的代码无法检测高代理和低代理。
问题是您使用的代码序列已经是UTF-8
/* Both of these are already UTF-8 chars. */
char *str1 = "\303\255";
char *str2 = "\346\234\254";

因此，您的 encode 函数实际上是在对已经是 UTF-8 编码的字节再做一次编码，这样不可能得到正确的结果。
当我在启用UTF-8的终端中打印这些序列时，我看到了您希望看到的内容：
$ printf "%s\n" $'\303\255'
í
$ printf "%s\n" $'\346\234\254'
本

因此，如果您遇到新问题，您可能需要重新思考您试图完成的任务，并发布一个新问题。
很遗憾，您不能把 `char` 值（无论是 `signed` 还是 `unsigned`）与 `0x100` 及以上的值进行比较——这样的比较结果是固定不变的。如果您想做的是把单字节值（ISO-8859-1）转换为 UTF-8，那么您忽略了一点：ISO-8859-1 字符与其对应的 UTF 字符具有相同的码值，因此转换非常简单，如下所示。

首先，所有 ISO-8859-1 字符的码值都与其对应的 UTF 码值相同，因此第一步转换是恒等映射：把 ISO-8859-1 中的每个值转换为相同的 UTF 值（注意：我说"UTF"时，指的是字符的 UTF 码值本身，不涉及任何编码方式；而说"UTF-8"时，指的是把该 UTF 码值编码为若干个 8 位字节）。

范围 `0x80…0xff` 中的 UTF 值必须用两个字节编码：第一个字节的模式为 `110000xx`，其中 `xx` 是输入码值的两个最高有效位（位 7 和位 6）；第二个字节的模式为 `10xxxxxx`，其中 `xxxxxx` 是输入码值的六个最低有效位（位 5 到位 0）。对于范围 `0x00…0x7f` 中的 UTF 值，直接用与码值相同的单个字节进行编码。

以下函数精确地实现了这一点：

/*
 * Convert one ISO-8859-1 byte to UTF-8 and NUL-terminate the result.
 *
 * buf: destination; up to 3 bytes are written (2 encoded bytes plus the
 *      terminator).
 * iso: the ISO-8859-1 byte to convert.
 *
 * Returns the number of encoded bytes, not counting the terminator:
 * 1 for values below 0x80, 2 otherwise.
 */
size_t iso2utf( unsigned char *buf, unsigned char iso )
{
    if ( iso < 0x80 ) {
        /* 0xxxxxxx: plain ASCII is copied through unchanged. */
        buf[0] = iso;
        buf[1] = '\0';
        return 1;
    }

    /* Lead byte 110000xx carries the two top bits of the input... */
    buf[0] = (unsigned char)(0xc0 | (iso >> 6));
    /* ...and continuation byte 10xxxxxx carries the low six bits. */
    buf[1] = (unsigned char)(0x80 | (iso & 0x3f));
    buf[2] = '\0';
    return 2;
} /* iso2utf */

如果您想在UTF-8编码器中加入完整的UTF，可以尝试以下方法（我使用了不同的方法，因为每个UTF字符最多有7个字节——实际上没有这么多，因为目前只使用24或25位代码）：

#包括
#包括
typedef unsigned int UTF；/*如果愿意，可以使用wchar\u t*/
typedef无符号字符字节；
/*我假设UTF字符串也是以零结尾的*/
大小\u t utf\u utf8（字节*out，utf*in）
{
大小=0；
对于（；*in；in++）{
UTF c=*in；/*复制UTF值*/
/*我们正在向后构造字符串，所以最后
*我们已经妥善订购了*/
大小\u t n=0；/*此值的字符数*/
字节aux[7]，/*缓冲区来构造字符串*/
*p=aux+sizeof aux；/*点一个单元格超过端点*/
静态UTF限制[]={0x80、0x20、0x10、0x08、0x4、0x2、0x01}；
静态UTF掩码[]={0x00、0xc0、0xe0、0xf0、0xf8、0xfc、0xfe}；
对于（；c>=限值[n]；c>>=6）{
*--p=0x80 |（c&0x3f）；n++；
}/*用于*/
*--p=掩码[n]| c；n++；
memcpy（out，p，n）；out+=n；res+=n；
}/*用于*/
*out='\0'；/*终止字符串*/
返回res；
}/*utf_utf8*/

请注意，每个UTF代码的七个字节是硬连线的，因为UTF代码是32位整数。我不希望UTF代码进一步超过32位限制，但在这种情况下，UTF

typedef

，以及表

aux

、

limits

和

masks

的大小和内容可能需要相应更改。值得一提的是，UTF-8 编码所使用的字节数同样有 7 或 8 的上限，而且标准中没有以任何形式规定码值空间用尽时应如何处理，因此最好不要过多依赖这些超长的编码。

我认为您的程序做的是正确的（但我没有检查代码）.如果我看一下，我会看到

\303

=195（dez），这是一个

和一个软连字符（未打印）。或者你的意思是\303\255
应该已经是UTF-8字符了吗？这个工具告诉我你得到的也是正确的。我得到的输入是\303\255
，这是的序列。输入也会输出\303\255。这个工具不会告诉你输入的编码方式文本正在使用。它可以是ISO 8859-15、拉丁文1或a
#include <string.h>
#include <stdlib.h>

typedef unsigned int    UTF; /* you can use wchar_t if you prefer */
typedef unsigned char   BYTE;

/*
 * Encode a zero-terminated array of UTF code values as a NUL-terminated
 * byte string, using the UTF-8 bit layout extended to lead bytes up to 0xfe.
 *
 * out: destination buffer; the caller must provide enough room (at most
 *      7 bytes per input value with a 32-bit UTF, plus the terminator).
 * in:  input values, terminated by a 0 element.
 *
 * Returns the number of bytes written, not counting the terminator.
 */
size_t utf_utf8 (BYTE *out, UTF *in)
{
    /* lead_limit[k]: smallest remaining value that no longer fits in the
     * lead byte once k continuation bytes have been produced. */
    static const UTF lead_limit[] = { 0x80, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
    /* lead_bits[k]: marker bits of the lead byte for k continuation bytes. */
    static const UTF lead_bits[]  = { 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe };

    size_t total = 0;

    while (*in != 0) {
        UTF    rest = *in++;          /* bits not yet emitted */
        BYTE   scratch[7];            /* filled from the end towards the start */
        size_t pos = sizeof scratch;  /* index of the first used cell */
        size_t tail = 0;              /* continuation bytes produced so far */

        /* Peel off six bits at a time until the remainder fits the lead byte. */
        while (rest >= lead_limit[tail]) {
            scratch[--pos] = (BYTE)(0x80 | (rest & 0x3f));
            rest >>= 6;
            tail++;
        }
        scratch[--pos] = (BYTE)(lead_bits[tail] | rest);

        /* The sequence is now in the right order at scratch[pos..]. */
        memcpy(out + total, scratch + pos, tail + 1);
        total += tail + 1;
    }

    out[total] = '\0'; /* terminate string */
    return total;
} /* utf_utf8 */