C++:在 C++ 中从 UTF-8 转换为 ISO 8859-15
我想在C/C++中完成从UTF-8到ISO 8859-15的转换,而不包括额外的库 我怎样才能做到这一点 我发现以下代码适用于ISO 8859-1,但我不确定如何处理ISO 8859-15和ISO 8859-1()之间的差异:C++ 在C+中从UTF-8转换为ISO8859-15+;,c++,string,encoding,utf-8,iso-8859-15,C++,String,Encoding,Utf 8,Iso 8859 15,我想在C/C++中完成从UTF-8到ISO 8859-15的转换,而不包括额外的库 我怎样才能做到这一点 我发现以下代码适用于ISO 8859-1,但我不确定如何处理ISO 8859-15和ISO 8859-1()之间的差异: std::字符串UTF8toISO8859_1(常量字符*in){ std::字符串输出; if(in==NULL) 返回; 无符号整数码点; 而(*in!=0){ 无符号字符ch=静态_转换(*in); if(ch我喜欢这段代码。它惊人地短。大部分代码只处理将多字节序列
std::string UTF8toISO8859_1(const char * in) {
std::string out;
if (in == NULL)
return out;
unsigned int codepoint;
while (*in != 0) {
unsigned char ch = static_cast<unsigned char>(*in);
if (ch <= 0x7f) // ……(代码片段在此处被截断)
我喜欢这段代码。它出奇地短。大部分代码只是在把多字节序列解码为码点。一旦解码出一个码点,转换为 ISO-8859-1 就非常简单:
- 如果小于或等于255,它也是一个有效的ISO-8859-1字符:
out.append(1, static_cast<char>(codepoint));
- 如果不是,则不能在ISO-8859-1中表示,并用问号替换:
out.append("?");
因此,要使其适用于 ISO-8859-15,需要更多代码来处理 ISO-8859-15 相对于 ISO-8859-1 替换掉的那些字符(请参阅两种编码的差异对照表)。不幸的是,这会大大增加代码量。
下面的代码应该很容易理解。如果性能是主要关注点,还可以对其进行优化以获得更好的性能。
/**
 * Converts a NUL-terminated UTF-8 string to ISO-8859-15 (Latin-9).
 *
 * The function name still says "ISO8859_1" so that existing callers keep
 * working; the mapping implemented here is the ISO-8859-15 one.
 *
 * @param in  NUL-terminated UTF-8 text; may be NULL.
 * @return    One output byte per decoded code point. Code points that have
 *            no ISO-8859-15 equivalent are replaced by '?'. Malformed UTF-8
 *            (stray continuation byte, invalid lead byte, truncated
 *            sequence, overlong form, value above U+10FFFF) also yields '?'
 *            instead of garbage bytes. A NULL input yields an empty string.
 */
std::string UTF8toISO8859_1(const char * in) {
    std::string out;
    if (in == NULL)
        return out;

    while (*in != 0) {
        const unsigned char lead = static_cast<unsigned char>(*in);
        ++in;

        unsigned int codepoint = 0;  // always initialised: never read stale bits
        unsigned int minimum = 0;    // smallest value legal for this sequence length
        int trailing = 0;            // continuation bytes still expected
        bool valid = true;

        if (lead <= 0x7f) {
            codepoint = lead;
        } else if (lead <= 0xbf) {
            valid = false;           // continuation byte without a lead byte
        } else if (lead <= 0xdf) {
            codepoint = lead & 0x1f;
            trailing = 1;
            minimum = 0x80;
        } else if (lead <= 0xef) {
            codepoint = lead & 0x0f;
            trailing = 2;
            minimum = 0x800;
        } else if (lead <= 0xf7) {
            codepoint = lead & 0x07;
            trailing = 3;
            minimum = 0x10000;
        } else {
            valid = false;           // 0xf8..0xff never occur in UTF-8
        }

        while (valid && trailing > 0) {
            const unsigned char next = static_cast<unsigned char>(*in);
            if ((next & 0xc0) != 0x80) {
                // Truncated sequence. The offending byte is NOT consumed, so
                // it is decoded on its own next iteration; this also keeps
                // the pointer from running past the terminating NUL.
                valid = false;
            } else {
                codepoint = (codepoint << 6) | (next & 0x3f);
                ++in;
                --trailing;
            }
        }

        // Reject overlong encodings and values beyond the Unicode range.
        if (codepoint < minimum || codepoint > 0x10ffff)
            valid = false;

        char outc = '?';
        if (valid) {
            switch (codepoint) {
                // Latin-1 positions that ISO-8859-15 reassigned: the old
                // characters have no representation any more.
                case 0xa4: case 0xa6: case 0xa8: case 0xb4:
                case 0xb8: case 0xbc: case 0xbd: case 0xbe:
                    break;
                // The characters ISO-8859-15 put in those positions.
                case 0x20ac: outc = static_cast<char>(0xa4); break;  // EURO SIGN
                case 0x0160: outc = static_cast<char>(0xa6); break;  // S WITH CARON
                case 0x0161: outc = static_cast<char>(0xa8); break;  // s with caron
                case 0x017d: outc = static_cast<char>(0xb4); break;  // Z WITH CARON
                case 0x017e: outc = static_cast<char>(0xb8); break;  // z with caron
                case 0x0152: outc = static_cast<char>(0xbc); break;  // LIGATURE OE
                case 0x0153: outc = static_cast<char>(0xbd); break;  // ligature oe
                case 0x0178: outc = static_cast<char>(0xbe); break;  // Y WITH DIAERESIS
                default:
                    // Every other code point up to 255 maps to the same byte.
                    if (codepoint <= 0xff)
                        outc = static_cast<char>(codepoint);
                    break;
            }
        }
        out.append(1, outc);
    }
    return out;
}
std::string UTF8toISO8859_1(const char * in) {
std::string out;
if (in == NULL)
return out;
unsigned int codepoint;
while (*in != 0) {
unsigned char ch = static_cast<unsigned char>(*in);
if (ch <= 0x7f) // ……(代码片段在此处被截断)
这可能有帮助吗?
/**
 * Converts a NUL-terminated UTF-8 string to ISO-8859-15 (Latin-9).
 *
 * The function name still says "ISO8859_1" so that existing callers keep
 * working; the mapping implemented here is the ISO-8859-15 one.
 *
 * @param in  NUL-terminated UTF-8 text; may be NULL.
 * @return    One output byte per decoded code point. Code points that have
 *            no ISO-8859-15 equivalent are replaced by '?'. Malformed UTF-8
 *            (stray continuation byte, invalid lead byte, truncated
 *            sequence, overlong form, value above U+10FFFF) also yields '?'
 *            instead of garbage bytes. A NULL input yields an empty string.
 */
std::string UTF8toISO8859_1(const char * in) {
    std::string out;
    if (in == NULL)
        return out;

    while (*in != 0) {
        const unsigned char lead = static_cast<unsigned char>(*in);
        ++in;

        unsigned int codepoint = 0;  // always initialised: never read stale bits
        unsigned int minimum = 0;    // smallest value legal for this sequence length
        int trailing = 0;            // continuation bytes still expected
        bool valid = true;

        if (lead <= 0x7f) {
            codepoint = lead;
        } else if (lead <= 0xbf) {
            valid = false;           // continuation byte without a lead byte
        } else if (lead <= 0xdf) {
            codepoint = lead & 0x1f;
            trailing = 1;
            minimum = 0x80;
        } else if (lead <= 0xef) {
            codepoint = lead & 0x0f;
            trailing = 2;
            minimum = 0x800;
        } else if (lead <= 0xf7) {
            codepoint = lead & 0x07;
            trailing = 3;
            minimum = 0x10000;
        } else {
            valid = false;           // 0xf8..0xff never occur in UTF-8
        }

        while (valid && trailing > 0) {
            const unsigned char next = static_cast<unsigned char>(*in);
            if ((next & 0xc0) != 0x80) {
                // Truncated sequence. The offending byte is NOT consumed, so
                // it is decoded on its own next iteration; this also keeps
                // the pointer from running past the terminating NUL.
                valid = false;
            } else {
                codepoint = (codepoint << 6) | (next & 0x3f);
                ++in;
                --trailing;
            }
        }

        // Reject overlong encodings and values beyond the Unicode range.
        if (codepoint < minimum || codepoint > 0x10ffff)
            valid = false;

        char outc = '?';
        if (valid) {
            switch (codepoint) {
                // Latin-1 positions that ISO-8859-15 reassigned: the old
                // characters have no representation any more.
                case 0xa4: case 0xa6: case 0xa8: case 0xb4:
                case 0xb8: case 0xbc: case 0xbd: case 0xbe:
                    break;
                // The characters ISO-8859-15 put in those positions.
                case 0x20ac: outc = static_cast<char>(0xa4); break;  // EURO SIGN
                case 0x0160: outc = static_cast<char>(0xa6); break;  // S WITH CARON
                case 0x0161: outc = static_cast<char>(0xa8); break;  // s with caron
                case 0x017d: outc = static_cast<char>(0xb4); break;  // Z WITH CARON
                case 0x017e: outc = static_cast<char>(0xb8); break;  // z with caron
                case 0x0152: outc = static_cast<char>(0xbc); break;  // LIGATURE OE
                case 0x0153: outc = static_cast<char>(0xbd); break;  // ligature oe
                case 0x0178: outc = static_cast<char>(0xbe); break;  // Y WITH DIAERESIS
                default:
                    // Every other code point up to 255 maps to the same byte.
                    if (codepoint <= 0xff)
                        outc = static_cast<char>(codepoint);
                    break;
            }
        }
        out.append(1, outc);
    }
    return out;
}