如何在适当的位置反转UTF-8字符串？_Utf 8_String_Algorithm_Performance

如何在适当的位置反转UTF-8字符串？

utf-8 string algorithm performance

如何在适当的位置反转UTF-8字符串？,utf-8,string,algorithm,performance,Utf 8,String,Algorithm,Performance,最近，有人问起一个问题。大多数建议的解决方案在处理非单字节字符串时都有问题。所以，我想知道什么是专门处理utf-8字符串的好算法我想出了一些代码，我将其作为答案发布，但我很高兴看到其他人的想法或建议。我更喜欢使用实际的代码，所以我选择了C#，因为它似乎是这个网站上最流行的语言之一，但我不介意您的代码是否使用另一种语言，只要熟悉命令式语言的人能够合理地理解它。而且，由于这是为了了解如何在低级别（低级别我只是指处理字节）实现这样的算法，因此我们的想法是避免将库用于核心代码注意事项：我对算法本身

最近，有人问起一个问题。大多数建议的解决方案在处理非单字节字符串时都有问题。所以，我想知道什么是专门处理utf-8字符串的好算法

我想出了一些代码，我将其作为答案发布，但我很高兴看到其他人的想法或建议。我更喜欢使用实际的代码，所以我选择了C#，因为它似乎是这个网站上最流行的语言之一，但我不介意您的代码是否使用另一种语言，只要熟悉命令式语言的人能够合理地理解它。而且，由于这是为了了解如何在低级别（低级别我只是指处理字节）实现这样的算法，因此我们的想法是避免将库用于核心代码

注意事项：

我感兴趣的是算法本身、它的性能以及如何对它进行优化（我指的是算法层面的优化，而不是用 ++i 取代 i++ 之类的微优化；我对实际的基准测试也不感兴趣）

我并不打算在生产代码中实际使用它，也无意“重新发明轮子”。这只是出于好奇，也是一种练习

我使用的是C#字节数组，因此我假设您可以获得字符串的长度，而无需在找到NUL之前遍历字符串。也就是说，我没有考虑查找字符串长度的复杂性。但是，如果您使用的是C，那么您可以在调用核心代码之前使用strlen（）将其考虑在内

编辑：

正如Mike F所指出的，我的代码（以及其他人在这里发布的代码）并不是在处理复合字符。关于这些的一些信息。我不熟悉这个概念，但如果这意味着存在“组合字符”，即仅在与其他“基本”字符/代码点组合时有效的字符/代码点，则可以使用此类字符的查找表来保留“全局”字符（“基本”+“组合”字符）的顺序当反转时。

我将进行一次反转字节的过程，然后进行第二次反转，将任何多字节字符（在UTF8中很容易检测到）中的字节反转回正确的顺序

你完全可以在一次遍历中完成这件事，但除非这个例程成为瓶颈，否则我不会费这个心。

我最初的方法可以总结如下：

1）天真地反转字节

2）从后向前遍历字符串，并在遍历过程中修复UTF-8序列

非法序列在第二步中处理；在第一步中，我们先检查字符串是否“同步”（即，它是否以合法的前导字节开头）

编辑：改进了对Reverse（）中前导字节的验证

类UTF8Utils{
公共静态无效反向（字节[]str）{
int len=str.长度；
int i=0；
int j=len-1；
//首先，检查字符串是否已“同步”，即它是否已启动
//具有有效的前导字符。将检查是否非法
//稍后，序列将遍历整个字符串。
字节leadChar=str[0]；
//如果它以10xx xxx开头，它是一个尾随字符。。。
//如果以1111 10xx或1111 110x开头
//它超出了4个字节的范围。
//编辑：添加了7字节seq和0xff的验证
如果（（leadChar&0xc0）==0x80||
（leadChar&0xfc）==0xf8||
（leadChar&0xfe）==0xfc||
（leadChar&0xff）==0xfe||
leadChar==0xff）{
抛出新异常（“非法UTF-8序列”）；
}
//天真地将字节反转到位
而（i=0）{
//因为假定未反转缓冲区中的第一个字节为
//该字节的前导字符，可以安全地假定
//最后一个字节现在是前导字符
//不是不同步--我们已经检查过了）
leadChar=str[i]；
//检查此序列需要多少字节，并根据
//非法序列
如果（leadChar<0x80）{
n字节=1；
}else if（（leadChar&0xe0）=0xc0）{
如果（（str[i-1]&0xc0）！=0x80）{
抛出新异常（“非法UTF-8序列”）；
}
n字节=2；
}else if（（leadChar&0xf0）=0xe0）{
如果（（str[i-1]&0xc0）！=0x80||
（str[i-2]&0xc0）！=0x80）{
抛出新异常（“非法UTF-8序列”）；
}
n字节=3；
}else if（（leadChar&0xf8）=0xf0）{
如果（（str[i-1]&0xc0）！=0x80||
（str[i-2]&0xc0）！=0x80||
（str[i-3]&0xc0）！=0x80）{
抛出新异常（“非法UTF-8序列”）；
}
n字节=4；
}否则{
抛出新异常（“非法UTF-8序列”）；
}
//现在，颠倒当前顺序，然后继续
//下一个在哪里
int back=i；
int front=back-n字节+1；
while（前<后）{
字节tmp=str[front]；
str[前]=str[后]；
str[back]=tmp；
前端++；
背--；
}
i-=nBytes；
}
}
}
/// <summary>
/// Byte-level helpers for UTF-8 encoded buffers.
/// </summary>
class UTF8Utils {

    /// <summary>
    /// Reverses, in place, the order of the UTF-8 encoded characters stored
    /// in <paramref name="str"/>.
    /// </summary>
    /// <remarks>
    /// Works in two passes: the buffer is first reversed byte by byte, then
    /// every multibyte sequence (which now has its bytes backwards) is
    /// reversed again so that each character reads correctly.
    ///
    /// Only the structure of the encoding is checked: the lead byte pattern
    /// and the number of continuation bytes that follow it. Overlong forms,
    /// surrogates and values above U+10FFFF are not rejected, and combining
    /// characters are not kept attached to their base character.
    ///
    /// When an illegal sequence is detected after the first byte, the buffer
    /// has already been modified and is left in that intermediate state.
    /// </remarks>
    /// <param name="str">UTF-8 bytes to reverse; an empty array is a no-op.</param>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    /// <exception cref="Exception">
    /// The buffer contains an illegal UTF-8 sequence (message
    /// "Illegal UTF-8 sequence").
    /// </exception>
    public static void Reverse(byte[] str) {
        if (str == null) {
            throw new ArgumentNullException("str");
        }

        //  nothing to do for an empty buffer; this also keeps the
        //  str[0] read below inside the array
        if (str.Length == 0) {
            return;
        }

        //  first, check that the string is "synced", i.e., it starts with a
        //  valid leading byte. The rest of the string is validated later.
        //  10xx xxxx is a trailing byte; 0xf8..0xff would start a sequence
        //  longer than 4 bytes or is not a UTF-8 byte at all.
        byte leadChar = str[0];
        if ((leadChar & 0xc0) == 0x80 || leadChar >= 0xf8) {
            throw new Exception("Illegal UTF-8 sequence");
        }

        //  reverse bytes in-place naively
        int i = 0;
        int j = str.Length - 1;
        while (i < j) {
            byte tmp = str[i];
            str[i] = str[j];
            str[j] = tmp;
            i++;
            j--;
        }

        //  now, run the string again to fix the multibyte sequences
        ReverseMbSequences(str);
    }

    /// <summary>
    /// Walks a byte-reversed UTF-8 buffer from its end towards its start and
    /// restores the byte order inside every multibyte sequence.
    /// </summary>
    /// <param name="str">Buffer that was just reversed byte by byte.</param>
    /// <exception cref="Exception">
    /// A lead byte is invalid, a sequence is truncated, or a byte that should
    /// be a continuation byte is not one.
    /// </exception>
    private static void ReverseMbSequences(byte[] str) {
        int i = str.Length - 1;

        while (i >= 0) {
            //  in the reversed buffer the lead byte of a sequence comes
            //  last, so scanning backwards meets the lead byte first
            byte leadChar = str[i];
            int nBytes;

            if (leadChar < 0x80) {
                nBytes = 1;
            } else if ((leadChar & 0xe0) == 0xc0) {
                nBytes = 2;
            } else if ((leadChar & 0xf0) == 0xe0) {
                nBytes = 3;
            } else if ((leadChar & 0xf8) == 0xf0) {
                nBytes = 4;
            } else {
                throw new Exception("Illegal UTF-8 sequence");
            }

            int back  = i;
            int front = back - nBytes + 1;

            //  a sequence that would start before index 0 is truncated;
            //  report it instead of indexing outside the array
            if (front < 0) {
                throw new Exception("Illegal UTF-8 sequence");
            }

            //  every byte before the lead byte must be a continuation byte
            for (int k = front; k < back; k++) {
                if ((str[k] & 0xc0) != 0x80) {
                    throw new Exception("Illegal UTF-8 sequence");
                }
            }

            //  put the bytes of this sequence back in reading order
            while (front < back) {
                byte tmp = str[front];
                str[front] = str[back];
                str[back] = tmp;
                front++;
                back--;
            }

            i -= nBytes;
        }
    }
}

/*
 * Reverse, in place, the bytes in the inclusive range [start, end].
 * Nothing happens when start is not below end.
 */
void reverse( char *start, char *end )
{
    for( ; start < end; ++start, --end )
    {
        const char saved = *end;
        *end = *start;
        *start = saved;
    }
}

/*
 * Reverse the bytes of the single character that begins at start and
 * return a pointer to the byte right after it.
 *
 * The character is taken to be the byte at start plus every directly
 * following byte of the form 10xxxxxx (a UTF-8 continuation byte).
 * The byte after start is always read, so start must not point at the
 * terminating NUL.
 */
char *reverse_char( char *start )
{
    char *next = start + 1;

    /* skip over the continuation bytes that belong to this character */
    while( (*next & 0xC0) == 0x80 )
        ++next;

    reverse( start, next - 1 );
    return next;
}

/*
 * Reverse a NUL-terminated UTF-8 string in place, character by character.
 *
 * Pass 1 reverses the bytes inside every character; pass 2 reverses the
 * whole buffer, which flips the character order and at the same time
 * puts the bytes of each character back in reading order.
 *
 * The input is not validated. A null pointer or an empty string is left
 * untouched.
 */
void reverse_string( char *string )
{
    char *end = string;

    if( !string ) return;

    /* pass 1: afterwards end points at the terminating NUL */
    while( *end ) end = reverse_char( end );

    /* pass 2: skipped for an empty string, where end-1 would point
       before the start of the buffer */
    if( end != string ) reverse( string, end-1 );
}

#include "string.h"

/*
 * Reverse a NUL-terminated UTF-8 string in place.
 *
 * The input is assumed to be valid UTF-8; nothing is validated here.
 * Pass 1 reverses the buffer byte by byte. Pass 2 scans forward and, each
 * time it meets a lead byte (11xxxxxx), swaps the 2 to 4 bytes collected
 * since the previous character back into reading order.
 */
void utf8rev(char *str)
{
    char *lo = str;
    char *hi = str + strlen(str);
    char *seq_begin;
    char *cursor;

    /* pass 1: plain byte reversal of the whole string */
    while (lo < hi) {
        char held = *lo;
        --hi;
        *lo = *hi;
        *hi = held;
        ++lo;
    }

    /* pass 2: after pass 1 each multibyte character is stored as its
       continuation bytes followed by its lead byte */
    seq_begin = str;
    cursor = str;
    while (*cursor != '\0') {
        const char byte = *cursor++;

        if ((byte & 0x80) == 0) {
            /* ASCII: the next character starts right after it */
            seq_begin = cursor;
        } else if ((byte & 0xc0) == 0xc0) {
            /* lead byte: the character spans [seq_begin, cursor) */
            char *first = seq_begin;
            char *last = cursor - 1;
            const int is_quad = (cursor - seq_begin == 4);

            if (is_quad || cursor - seq_begin == 3 || cursor - seq_begin == 2) {
                /* outer pair; for 3 bytes the middle one stays put */
                char held = *first;
                *first = *last;
                *last = held;

                if (is_quad) {
                    /* inner pair of a 4-byte character */
                    held = first[1];
                    first[1] = last[-1];
                    last[-1] = held;
                }
            }
            seq_begin = cursor;
        }
        /* continuation byte: keep collecting */
    }
}

// quick and dirty main for testing purposes
#include "stdio.h"

/*
 * Test driver: for every command-line argument, taken from last to first,
 * print the argument, reverse it with utf8rev() and print the result.
 *
 * Each argument is reversed directly in its argv storage, so arguments of
 * any length are handled whole and a multibyte sequence can never be cut
 * in half by a fixed-size copy.
 */
int main(int argc, char* argv[])
{
    while (--argc > 0) {
        char *arg = argv[argc];

        printf("%s → ", arg);
        utf8rev(arg);
        printf("%s\n", arg);
    }
    return 0;
}

$ so199260 γεια και χαρά français АДЖИ a♠♡♢♣b
a♠♡♢♣b → b♣♢♡♠a
АДЖИ → ИЖДА
français → siaçnarf
χαρά → άραχ
και → ιακ
γεια → αιεγ