C++ bit hack vs循环内的条件语句_C++_C

C++ bit hack vs循环内的条件语句

c++ c

C++ bit hack vs循环内的条件语句,c++,c,C++,C,我有一个CRC计算函数，在其内部循环中有以下内容： if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++; if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++; if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++; if (uMsgByte &

我有一个CRC计算函数，在其内部循环中有以下内容：

// Walks the bits of uMsgByte from 0x80 down to 0x01. When a bit is set, the
// table entry currently pointed to is XORed into crc.
// Only the `crc ^= ...` statement is guarded by each `if`; the pointer
// increment that follows on the same line runs unconditionally.
if (uMsgByte & 0x80) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x40) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x20) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x10) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x08) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x04) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x02) crc ^= *pChkTableOffset; pChkTableOffset++;
if (uMsgByte & 0x01) crc ^= *pChkTableOffset; pChkTableOffset++;

分析表明，在这些语句上花费了大量时间。我想知道我是否可以通过用“bit hacks”替换条件句来获得一些收益。我尝试了以下方法，但速度没有提高：

// Branch-free form of the same bit walk. `!(uMsgByte & bit) - 1` is 0 when the
// bit is clear and -1 (all bits set) when it is set, so ANDing it with the
// table entry either discards the entry or keeps it before the XOR into crc.
// Unlike the `if` form, every table entry is read regardless of the bit value.
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x80) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x40) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x20) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x10) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x08) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x04) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x02) - 1);
crc ^= *pChkTableOffset++ & (!(uMsgByte & 0x01) - 1);

这在最近的x86 CPU上应该更快吗？还是说有更好的方法来实现这些“位技巧”（bit hacks）？

我不能确定哪种更快，但它们肯定不同——哪种更快在很大程度上取决于所使用的处理器品牌和型号，因为它们在[可能不可预测的]分支上的行为不同。更复杂的是，不同的处理器对“依赖计算”有不同的行为

我将发布的代码转换为以下代码（这使生成的代码大约有原来的一半长，但在概念层面上完全相同）：

并使用

clang++ -S -O2

编译：

func1:

# Branching version. %eax accumulates the result, %dil is the byte whose bits
# are tested, and %rsi is the base address of the byte table being read.
# This listing handles four bits only (0x80, 0x40, 0x20, 0x10).
_Z5func1jPh:                            # @_Z5func1jPh
        # Start with a zero result.
        xorl    %eax, %eax
        # Testing %dil against itself sets the sign flag from bit 7; jns skips
        # the load when that bit is clear.
        testb   %dil, %dil
        jns     .LBB0_2
        # The result is still zero here, so the first XOR is just a zero-extended load.
        movzbl  (%rsi), %eax
.LBB0_2:                                # %if.end
        # Bit 0x40: XOR in table byte 1 when set.
        testb   $64, %dil
        je      .LBB0_4
        movzbl  1(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_4:                                # %if.end.6
        # Bit 0x20: XOR in table byte 2 when set.
        testb   $32, %dil
        je      .LBB0_6
        movzbl  2(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_6:                                # %if.end.13
        # Bit 0x10: XOR in table byte 3 when set.
        testb   $16, %dil
        je      .LBB0_8
        movzbl  3(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_8:                                # %if.end.20
        retq

# Branch-free version. For each tested bit, a copy of %edi is shifted left so
# the bit lands in bit 31, then shifted right arithmetically by 31 to produce
# a mask of all zeros or all ones. The mask is ANDed with the table byte and
# the four masked values are XORed together into %eax.
_Z5func2jPh:                            # @_Z5func2jPh
        # Bit 0x80 (shift by 24) masks table byte 0.
        movzbl  (%rsi), %eax
        movl    %edi, %ecx
        shll    $24, %ecx
        sarl    $31, %ecx
        andl    %eax, %ecx
        # Bit 0x40 (shift by 25) masks table byte 1.
        movzbl  1(%rsi), %eax
        movl    %edi, %edx
        shll    $25, %edx
        sarl    $31, %edx
        andl    %edx, %eax
        # Combine the first two masked bytes.
        xorl    %ecx, %eax
        # Bit 0x20 (shift by 26) masks table byte 2.
        movzbl  2(%rsi), %ecx
        movl    %edi, %edx
        shll    $26, %edx
        sarl    $31, %edx
        andl    %ecx, %edx
        # Bit 0x10 (shift by 27) masks table byte 3; %edi itself is reused as the mask.
        movzbl  3(%rsi), %ecx
        shll    $27, %edi
        sarl    $31, %edi
        andl    %ecx, %edi
        # Fold the last two masked bytes into the result.
        xorl    %edx, %edi
        xorl    %edi, %eax
        retq

func2:

# Branching version. %eax accumulates the result, %dil is the byte whose bits
# are tested, and %rsi is the base address of the byte table being read.
# This listing handles four bits only (0x80, 0x40, 0x20, 0x10).
_Z5func1jPh:                            # @_Z5func1jPh
        # Start with a zero result.
        xorl    %eax, %eax
        # Testing %dil against itself sets the sign flag from bit 7; jns skips
        # the load when that bit is clear.
        testb   %dil, %dil
        jns     .LBB0_2
        # The result is still zero here, so the first XOR is just a zero-extended load.
        movzbl  (%rsi), %eax
.LBB0_2:                                # %if.end
        # Bit 0x40: XOR in table byte 1 when set.
        testb   $64, %dil
        je      .LBB0_4
        movzbl  1(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_4:                                # %if.end.6
        # Bit 0x20: XOR in table byte 2 when set.
        testb   $32, %dil
        je      .LBB0_6
        movzbl  2(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_6:                                # %if.end.13
        # Bit 0x10: XOR in table byte 3 when set.
        testb   $16, %dil
        je      .LBB0_8
        movzbl  3(%rsi), %ecx
        xorl    %ecx, %eax
.LBB0_8:                                # %if.end.20
        retq

# Branch-free version. For each tested bit, a copy of %edi is shifted left so
# the bit lands in bit 31, then shifted right arithmetically by 31 to produce
# a mask of all zeros or all ones. The mask is ANDed with the table byte and
# the four masked values are XORed together into %eax.
_Z5func2jPh:                            # @_Z5func2jPh
        # Bit 0x80 (shift by 24) masks table byte 0.
        movzbl  (%rsi), %eax
        movl    %edi, %ecx
        shll    $24, %ecx
        sarl    $31, %ecx
        andl    %eax, %ecx
        # Bit 0x40 (shift by 25) masks table byte 1.
        movzbl  1(%rsi), %eax
        movl    %edi, %edx
        shll    $25, %edx
        sarl    $31, %edx
        andl    %edx, %eax
        # Combine the first two masked bytes.
        xorl    %ecx, %eax
        # Bit 0x20 (shift by 26) masks table byte 2.
        movzbl  2(%rsi), %ecx
        movl    %edi, %edx
        shll    $26, %edx
        sarl    $31, %edx
        andl    %ecx, %edx
        # Bit 0x10 (shift by 27) masks table byte 3; %edi itself is reused as the mask.
        movzbl  3(%rsi), %ecx
        shll    $27, %edi
        sarl    $31, %edi
        andl    %ecx, %edi
        # Fold the last two masked bytes into the result.
        xorl    %edx, %edi
        xorl    %edi, %eax
        retq

如您所见，编译器为第一个版本生成了分支，而在第二个版本中使用逻辑运算——后者每个条件对应的指令要多几条

我可以编写一些代码来对每个循环进行基准测试，但我保证结果在不同版本的x86处理器之间会有很大差异

我不确定这是否是一种常见的CRC计算，但大多数CRC计算都有优化版本，使用表格和其他“聪明的东西”以更快的方式执行正确的计算。

我想看看人类是否能打败优化编译器，我用两种方式编写了您的算法：

在这里，您可以像编写机器代码一样表达意图

/// Folds one message byte into a running checksum.
///
/// The bits of `uMsgByte` are visited from the most significant (0x80) to the
/// least significant (0x01). For every bit that is set, the table entry at the
/// matching position (entry 0 for 0x80 ... entry 7 for 0x01) is XORed into the
/// checksum. Entries belonging to clear bits are never read.
///
/// @param uMsgByte        message byte whose bits select table entries
/// @param crc             checksum value to update
/// @param pChkTableOffset first of the eight table entries for this byte
/// @return the updated checksum
std::uint32_t foo1(std::uint8_t uMsgByte,
                   std::uint32_t crc,
                   const std::uint32_t* pChkTableOffset)
{
    std::uint32_t folded = crc;
    unsigned selector = 0x80u;

    // One table slot per bit; the selector walks down from the top bit.
    for (int slot = 0; slot != 8; ++slot, selector >>= 1) {
        const bool bitIsSet = (uMsgByte & selector) != 0;
        if (bitIsSet) {
            folded ^= pChkTableOffset[slot];
        }
    }

    return folded;
}

在这里，我用一种更加算法化的方式来表达我的意图

/// Loop-based form of the per-bit checksum update.
///
/// Visits all eight bits of `uMsgByte` from 0x80 down to 0x01 and, for each
/// set bit, XORs the table entry at the matching index (0 for 0x80 ... 7 for
/// 0x01) into the checksum.
///
/// @param uMsgByte        message byte whose bits select table entries
/// @param crc             checksum value to update
/// @param pChkTableOffset first of the eight table entries for this byte
/// @return the updated checksum
std::uint32_t foo2(std::uint8_t uMsgByte,
                   std::uint32_t crc,
                   const std::uint32_t* pChkTableOffset)
{
    // A byte has eight bits. Stopping at seven would silently drop the 0x01
    // bit and make this disagree with the unrolled version.
    constexpr int kBitsPerByte = 8;

    for (int i = 0; i < kBitsPerByte; ++i) {
        if (uMsgByte & (0x80u >> i))
            crc ^= pChkTableOffset[i];
    }
    return crc;
}

如果编译器在使用逻辑运算而不是条件语句的代码版本中也能表现得如此出色，那将是一件有趣的事情

鉴于：

/// Branch-free per-bit checksum update.
///
/// For each bit of `uMsgByte`, from 0x80 down to 0x01, builds a mask that is
/// all ones when the bit is set and zero otherwise, ANDs it with the next
/// table entry and XORs the result into the checksum. All eight table entries
/// are read regardless of the bit values.
///
/// @param uMsgByte        message byte whose bits select table entries
/// @param crc             checksum value to update
/// @param pChkTableOffset first of the eight table entries for this byte
/// @return the updated checksum
std::uint32_t logical1(std::uint8_t uMsgByte,
                       std::uint32_t crc,
                       const std::uint32_t* pChkTableOffset)
{
    // `shift` runs 7, 6, ..., 0 so the most significant bit is handled first.
    for (unsigned shift = 8; shift-- > 0; ++pChkTableOffset) {
        const std::uint32_t bit = (uMsgByte >> shift) & 1u;
        // 0 - 1 wraps to 0xFFFFFFFF, 0 - 0 stays 0.
        const std::uint32_t keep = std::uint32_t{0} - bit;
        crc ^= *pChkTableOffset & keep;
    }

    return crc;
}

产生的机器代码为：

以下指令序列重复8次：

    movl    %edi, %eax     ; get uMsgByte into eax
    shll    $24, %eax      ; shift it left 24 bits so that bit 7 is in the sign bit
    sarl    $31, %eax      ; arithmetic shift right to copy the sign bit into all other bits
    andl    (%rdx), %eax   ; and the result with the value from the table
    xorl    %esi, %eax     ; exclusive-or into crc

因此，简短的回答是肯定的-它的性能非常好（省去了pChkTableOffset的冗余增量）

它更快吗？谁知道呢。差异可能小到无法测量——在这两种情况下，读取内存的次数是相同的。至于避免分支是否更好，编译器比您更善于判断（这取决于编译器所针对优化的体系结构）

它是否更优雅、更具可读性？对我自己来说，不是。这是我在以下情况下编写的代码：

c语言还是一门年轻的语言
处理器非常简单，我可以更好地优化
处理器太慢了，我不得不

这些都不再适用。

如果这个校验和确实是CRC，那么有一种更有效的方法来实现它

假设它是CRC16：

头文件：

// Table-driven CRC-16 calculator: the constructor fills a 256-entry lookup
// table for a given polynomial, and CalcCRC consumes one table lookup per
// message byte.
class CRC16
{
public:
    // Builds the lookup table for the generator polynomial `poly`.
    CRC16(const unsigned short poly);
    // Returns the CRC of the `len` bytes starting at `pbuf`; the running value starts at 0.
    unsigned short CalcCRC(unsigned char * pbuf, int len);

protected:
    // One precomputed entry per possible byte value.
    unsigned short CRCTab[256];
    // Returns the low `bits` bits of `swap` in reversed bit order.
    unsigned long SwapBits(unsigned long swap, int bits);
};

实现：

// Fills the 256-entry lookup table for the polynomial `poly`.
// Each entry starts as the bit-reversed index placed in the high byte, is
// pushed through eight shift-and-conditionally-XOR steps, and is finally
// bit-reversed across all 16 bits before being stored.
CRC16::CRC16(const unsigned short poly)
{
    for (int index = 0; index != 256; ++index) {
        unsigned short entry = SwapBits(index, 8) << 8;

        for (int step = 0; step != 8; ++step) {
            // Look at the top bit before it is shifted out.
            const bool topBitSet = (entry & 0x8000) != 0;
            entry = entry << 1;
            if (topBitSet) {
                entry ^= poly;
            }
        }

        CRCTab[index] = SwapBits(entry, 16);
    }
}

// Returns the low `bits` bits of `swap` in reversed order: bit 0 moves to
// position bits-1, bit 1 to bits-2, and so on. Higher input bits are ignored.
// A non-positive `bits` yields 0.
unsigned long CRC16::SwapBits(unsigned long swap, int bits)
{
    unsigned long r = 0;
    for(int i = 0; i < bits; i++) {
        // Shift an unsigned long one, not an int: an int shift would overflow
        // or sign-extend into the result once `bits` reaches 32.
        if(swap & 1) r |= 1UL << (bits - i - 1);
        swap >>= 1;
    }
    return r;
}

// Computes the CRC of the `len` bytes starting at `pbuf`, using one table
// lookup per byte. The running value starts at 0 and is returned unmodified
// after the last byte. A zero or negative `len` returns 0 without reading
// from `pbuf`.
unsigned short CRC16::CalcCRC(unsigned char * pbuf, int len)
{
    unsigned short r = 0;
    // `len-- > 0` rather than `len--`: a negative length must not turn into a
    // loop that runs off the end of the buffer.
    while(len-- > 0) r = (r >> 8) ^ CRCTab[(r & 0xFF) ^ *(pbuf++)];
    return r;
}

CRC16:：CRC16（常量无符号短多边形）
{
对于（int i=0；i<256；i++）{
CRCTab[i]=SwapBits（i，8）>8）^CRCTab[（r&0xFF）^*（pbuf++）]；
返回r；
}

如您所见，消息的每个字节只使用一次，而不是8次

CRC8也有一个类似的实现。

出于兴趣，扩展了alain关于预计算CRC表的优秀建议，我想到可以修改该类以利用c++14的

constexpr

：

#include <iostream>
#include <utility>
#include <string>

// Table-driven CRC-16 calculator whose 256-entry lookup table can be built at
// compile time: both constructors and SwapBits are constexpr, so a
// `constexpr CRC16` object carries a fully precomputed table.
class CRC16
{
private:

    // the storage for the CRC table, to be computed at compile time
    unsigned short CRCTab[256];

    // Private helper constructor: Is... expands to 0..255 and seeds entry i
    // with the bit-reversed index placed in the high byte. The polynomial is
    // not needed for seeding, so that parameter is left unnamed. The explicit
    // cast keeps the braced initializer free of narrowing conversions.
    template<std::size_t...Is>
    constexpr CRC16(const unsigned short, std::integer_sequence<std::size_t, Is...>)
    : CRCTab { static_cast<unsigned short>(SwapBits(Is, 8) << 8) ... }
    {}

    // Returns the low `bits` bits of `swap` in reversed order (usable at
    // compile time). A non-positive `bits` yields 0.
    static constexpr unsigned long SwapBits(unsigned long swap, int bits)
    {
        unsigned long r = 0;
        for(int i = 0; i < bits; i++) {
            // Shift an unsigned long one, not an int: an int shift would
            // overflow or sign-extend once `bits` reaches 32.
            if(swap & 1) r |= 1UL << (bits - i - 1);
            swap >>= 1;
        }
        return r;
    }


public:

    // Builds the lookup table for the generator polynomial `poly`.
    // Delegates to the private constructor to seed the table, then runs the
    // eight shift-and-conditionally-XOR steps per entry and bit-reverses the
    // 16-bit result.
    constexpr CRC16(const unsigned short poly)
    : CRC16(poly, std::make_index_sequence<256>())
    {
        for(int i = 0; i < 256; i++) {
            for(int j = 0; j < 8; j++)
                CRCTab[i] = (CRCTab[i] << 1) ^ ((CRCTab[i] & 0x8000) ? poly : 0);
            CRCTab[i] = SwapBits(CRCTab[i], 16);
        }
    }

    // Computes the CRC of the `len` bytes starting at `pbuf`; the running
    // value starts at 0. A zero or negative `len` returns 0 without reading
    // from `pbuf`. Const so that it can be called on constexpr CRC16 objects.
    unsigned short CalcCRC(const unsigned char * pbuf, int len) const
    {
        unsigned short r = 0;
        // `len-- > 0` rather than `len--`: a negative length must not turn
        // into a loop that runs off the end of the buffer.
        while(len-- > 0) r = (r >> 8) ^ CRCTab[(r & 0xFF) ^ *(pbuf++)];
        return r;
    }

};



// Demo driver: builds a CRC16 table as a constant expression, computes the
// CRC of a short string and prints it.
int main()
{
    // create my constexpr CRC16 object at compile time
    constexpr CRC16 crctab(1234);

    // calculate the CRC of something...
    // Only the literal suffix is needed, so pull in just that namespace.
    using namespace std::string_literals;
    const auto s = "hello world"s;

    // CalcCRC takes an int length; make the size_t -> int conversion explicit.
    const auto crc = crctab.CalcCRC(reinterpret_cast<const unsigned char*>(s.data()),
                                    static_cast<int>(s.size()));

    std::cout << crc << std::endl;

    return 0;
}

整个字符串的CRC计算如下：

        leaq    __ZZ4mainE6crctab(%rip), %rdi ; <- referencing const data :)
        movzwl  (%rdi,%rdx,2), %edx
        jmp     LBB0_8
LBB0_4:
        xorl    %edx, %edx
        jmp     LBB0_11
LBB0_6:
        xorl    %edx, %edx
LBB0_8:                                 ## %.lr.ph.i.preheader.split
        testl   %esi, %esi
        je      LBB0_11
## BB#9:
        leaq    __ZZ4mainE6crctab(%rip), %rsi
        .align  4, 0x90
LBB0_10:                                ## %.lr.ph.i
                                        ## =>This Inner Loop Header: Depth=1
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  (%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  1(%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        addq    $2, %rcx
        addl    $-2, %eax
        jne     LBB0_10
LBB0_11:

leaq\uuzz4maine6crctab（%rip），%rdi；此内部循环头：深度=1
movzwl%dx，%edi
movzbl%dh，%edx#NOREX
movzbl%dil，%edi
movzbl（%rcx），%ebx
xorq%rdi，%rbx
xorw（%rsi，%rbx，2），%dx
movzwl%dx，%edi
movzbl%dh，%edx#NOREX
movzbl%dil，%edi
movzbl 1（%rcx），%ebx
xorq%rdi，%rbx
xorw（%rsi，%rbx，2），%dx
加成$2，%rcx
地址$-2，%eax
jne LBB0_10
LBB0_11：

您应该比较一下asm输出，看看是否有任何差异。
我会编译并展示差异……不过我怀疑差异不会有多大……
如果您只对x86感兴趣，并且认真考虑对此进行优化，那么我建议使用SSE。
都2015年了还在用bit hacks，哈哈。
感谢您的深入见解。如果编译器在使用逻辑运算而不是条件语句的代码版本中也能表现得同样出色，那会很有趣。
我在CRC16算法的一个constexpr改编版本中提到了您。
不错，我还没想到这一点！
@alain - 您还会发现，如果按照first/last迭代器而不是地址/长度来编写函数，还可以做进一步的优化。

        leaq    __ZZ4mainE6crctab(%rip), %rdi ; <- referencing const data :)
        movzwl  (%rdi,%rdx,2), %edx
        jmp     LBB0_8
LBB0_4:
        xorl    %edx, %edx
        jmp     LBB0_11
LBB0_6:
        xorl    %edx, %edx
LBB0_8:                                 ## %.lr.ph.i.preheader.split
        testl   %esi, %esi
        je      LBB0_11
## BB#9:
        leaq    __ZZ4mainE6crctab(%rip), %rsi
        .align  4, 0x90
LBB0_10:                                ## %.lr.ph.i
                                        ## =>This Inner Loop Header: Depth=1
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  (%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        movzwl  %dx, %edi
        movzbl  %dh, %edx  # NOREX
        movzbl  %dil, %edi
        movzbl  1(%rcx), %ebx
        xorq    %rdi, %rbx
        xorw    (%rsi,%rbx,2), %dx
        addq    $2, %rcx
        addl    $-2, %eax
        jne     LBB0_10
LBB0_11: