C++ 使用goto能否创建编译器在C++中无法生成的优化？_C++

C++ 使用goto能否创建编译器在C++中无法生成的优化？

c++

C++ 使用goto能否创建编译器在C++中无法生成的优化？,c++,C++,使用goto而不是循环（loops），是否会得到一系列比编译器为循环生成的指令更高效的跳转指令？例如：如果我在switch语句中嵌套了一个while循环，它将嵌套在另一个循环中，而这个循环将嵌套在另一个switch语句中，使用goto是否真的比编译器在使用循环和非goto时生成的跳转指令更聪明？使用goto可能会获得一点速度优势。然而，情况可能恰恰相反。编译器在使用SIMD指令检测和展开循环或优化循环方面已经变得非常出色。您很可能会删除编译器的所有这些优化选项，因为它们不是为优化goto语句而构建的。您

使用goto而不是循环（loops），是否会得到一系列比编译器为循环生成的指令更高效的跳转指令？

例如：如果我在switch语句中嵌套了一个while循环，它将嵌套在另一个循环中，而这个循环将嵌套在另一个switch语句中，使用goto是否真的比编译器在使用循环和非goto时生成的跳转指令更聪明？

使用goto可能会获得一点速度优势。然而，情况也可能恰恰相反。编译器在检测并展开循环、或使用SIMD指令优化循环方面已经变得非常出色。使用goto很可能会让编译器失去所有这些优化机会，因为这些优化并不是为goto语句而设计的。您还可以通过编写函数来避免使用goto，这样编译器就可以内联函数并消除跳转。

如果你考虑使用goto进行优化，我会说这是一个非常糟糕的想法。先用干净的代码表达您的算法，之后再进行优化。如果您需要更高的性能，请考虑您的数据以及对数据的访问方式。这才是获得或损失性能的关键所在。

因为您需要代码来证明这一点，所以我构建了以下示例。我在Intel i7-3537U上使用了gcc 6.3.1和-O3。如果您试图重现该示例，您的结果可能会因编译器或硬件的不同而有所不同

#include <iostream>
#include <array>
#include "ToolBox/Instrumentation/Profiler.hpp"

// Size helpers: kilo = 1024, mega = 1024 * 1024.
constexpr size_t kilo = 1024;
constexpr size_t mega = kilo * 1024;

// Number of elements in each benchmark buffer: 512 * 1024 * 1024 = 536870912.
constexpr size_t size = 512*mega;
// Fixed-size buffer of `size` chars (one byte per element, i.e. 512 MiB per object).
using container = std::array<char, size>;
// Keys identifying the two code paths being timed; passed to profile(...) in
// sum_jump() and sum(), and given display names when the profiler is built in main().
enum class Measurements {
    jump,
    loop
};

// Element-wise addition of two buffers, written as an ordinary structured loop.
// Stores data1[k] + data2[k] into result[k] for every index k in [0, size).
// The profile(...) statement tags this call as the `loop` measurement.
void sum(container& result, const container& data1, const container& data2) {
    profile(Measurements::loop);
    unsigned int idx = 0;
    while (idx < size) {
        result[idx] = data1[idx] + data2[idx];
        ++idx;
    }
}

// Element-wise addition of two buffers, with the loop spelled out by hand as a
// label plus a conditional backward goto (the variant compared against sum()).
// Stores data1[i] + data2[i] into result[i] for every index i in [0, size).
// The profile(...) statement tags this call as the `jump` measurement.
void sum_jump(container& result, const container& data1, const container& data2) {
    profile(Measurements::jump);
    unsigned int i = 0;
label:
    // The body runs before the bound is tested; that is safe because `size`
    // is a non-zero compile-time constant.
    result[i] = data1[i] + data2[i];
    i++;
    // Jump back while elements remain. The earlier test (i == size) was false
    // right after the first element, so only index 0 was ever processed.
    if(i < size) goto label;
}

int main() {
    // This segment is just for benchmarking purposes
    // Just ignore this
    ToolBox::Instrumentation::Profiler<Measurements, std::chrono::nanoseconds, 2> profiler(
        std::cout,
        {
            {Measurements::jump, "jump"},
            {Measurements::loop, "loop"}
        }
    );


    // allocate memory to execute our sum functions on
    container data1, data2, result;

    // run the benchmark 100 times to account for caching of the data
    for(unsigned i = 0; i < 100; i++) {

    sum_jump(result, data1, data2);
    sum(result, data1, data2);

    }
}

好的，我们看到运行时间上并没有差别，因为瓶颈在内存带宽，而不是CPU指令。但让我们看看生成的汇编指令：

Dump of assembler code for function sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&):
   0x00000000004025c0 <+0>:     push   %r15
   0x00000000004025c2 <+2>:     push   %r14
   0x00000000004025c4 <+4>:     push   %r12
   0x00000000004025c6 <+6>:     push   %rbx
   0x00000000004025c7 <+7>:     push   %rax
   0x00000000004025c8 <+8>:     mov    %rdx,%r15
   0x00000000004025cb <+11>:    mov    %rsi,%r12
   0x00000000004025ce <+14>:    mov    %rdi,%rbx
   0x00000000004025d1 <+17>:    callq  0x402110 <_ZNSt6chrono3_V212system_clock3nowEv@plt>
   0x00000000004025d6 <+22>:    mov    %rax,%r14
   0x00000000004025d9 <+25>:    lea    0x20000000(%rbx),%rcx
   0x00000000004025e0 <+32>:    lea    0x20000000(%r12),%rax
   0x00000000004025e8 <+40>:    lea    0x20000000(%r15),%rsi
   0x00000000004025ef <+47>:    cmp    %rax,%rbx
   0x00000000004025f2 <+50>:    sbb    %al,%al
   0x00000000004025f4 <+52>:    cmp    %rcx,%r12
   0x00000000004025f7 <+55>:    sbb    %dl,%dl
   0x00000000004025f9 <+57>:    and    %al,%dl
   0x00000000004025fb <+59>:    cmp    %rsi,%rbx
   0x00000000004025fe <+62>:    sbb    %al,%al
   0x0000000000402600 <+64>:    cmp    %rcx,%r15
   0x0000000000402603 <+67>:    sbb    %cl,%cl
   0x0000000000402605 <+69>:    test   $0x1,%dl
   0x0000000000402608 <+72>:    jne    0x40268b <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+203>
   0x000000000040260e <+78>:    and    %cl,%al
   0x0000000000402610 <+80>:    and    $0x1,%al
   0x0000000000402612 <+82>:    jne    0x40268b <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+203>
   0x0000000000402614 <+84>:    xor    %eax,%eax
   0x0000000000402616 <+86>:    nopw   %cs:0x0(%rax,%rax,1)
   0x0000000000402620 <+96>:    movdqu (%r12,%rax,1),%xmm0
   0x0000000000402626 <+102>:   movdqu 0x10(%r12,%rax,1),%xmm1
   0x000000000040262d <+109>:   movdqu (%r15,%rax,1),%xmm2
   0x0000000000402633 <+115>:   movdqu 0x10(%r15,%rax,1),%xmm3
   0x000000000040263a <+122>:   paddb  %xmm0,%xmm2
   0x000000000040263e <+126>:   paddb  %xmm1,%xmm3
   0x0000000000402642 <+130>:   movdqu %xmm2,(%rbx,%rax,1)
   0x0000000000402647 <+135>:   movdqu %xmm3,0x10(%rbx,%rax,1)
   0x000000000040264d <+141>:   movdqu 0x20(%r12,%rax,1),%xmm0
   0x0000000000402654 <+148>:   movdqu 0x30(%r12,%rax,1),%xmm1
   0x000000000040265b <+155>:   movdqu 0x20(%r15,%rax,1),%xmm2
   0x0000000000402662 <+162>:   movdqu 0x30(%r15,%rax,1),%xmm3
   0x0000000000402669 <+169>:   paddb  %xmm0,%xmm2
   0x000000000040266d <+173>:   paddb  %xmm1,%xmm3
   0x0000000000402671 <+177>:   movdqu %xmm2,0x20(%rbx,%rax,1)
   0x0000000000402677 <+183>:   movdqu %xmm3,0x30(%rbx,%rax,1)
   0x000000000040267d <+189>:   add    $0x40,%rax
   0x0000000000402681 <+193>:   cmp    $0x20000000,%rax
   0x0000000000402687 <+199>:   jne    0x402620 <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+96>
   0x0000000000402689 <+201>:   jmp    0x4026d5 <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+277>
   0x000000000040268b <+203>:   xor    %eax,%eax
   0x000000000040268d <+205>:   nopl   (%rax)
   0x0000000000402690 <+208>:   movzbl (%r15,%rax,1),%ecx
   0x0000000000402695 <+213>:   add    (%r12,%rax,1),%cl
   0x0000000000402699 <+217>:   mov    %cl,(%rbx,%rax,1)
   0x000000000040269c <+220>:   movzbl 0x1(%r15,%rax,1),%ecx
   0x00000000004026a2 <+226>:   add    0x1(%r12,%rax,1),%cl
   0x00000000004026a7 <+231>:   mov    %cl,0x1(%rbx,%rax,1)
   0x00000000004026ab <+235>:   movzbl 0x2(%r15,%rax,1),%ecx
   0x00000000004026b1 <+241>:   add    0x2(%r12,%rax,1),%cl
   0x00000000004026b6 <+246>:   mov    %cl,0x2(%rbx,%rax,1)
   0x00000000004026ba <+250>:   movzbl 0x3(%r15,%rax,1),%ecx
   0x00000000004026c0 <+256>:   add    0x3(%r12,%rax,1),%cl
   0x00000000004026c5 <+261>:   mov    %cl,0x3(%rbx,%rax,1)
   0x00000000004026c9 <+265>:   add    $0x4,%rax
   0x00000000004026cd <+269>:   cmp    $0x20000000,%rax
   0x00000000004026d3 <+275>:   jne    0x402690 <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+208>
   0x00000000004026d5 <+277>:   callq  0x402110 <_ZNSt6chrono3_V212system_clock3nowEv@plt>
   0x00000000004026da <+282>:   sub    %r14,%rax
   0x00000000004026dd <+285>:   add    %rax,0x202b74(%rip)        # 0x605258 <_ZN7ToolBox15Instrumentation6detail19ProfilerMeasurementI12MeasurementsLS3_1EE14totalTimeSpentE>
   0x00000000004026e4 <+292>:   incl   0x202b76(%rip)        # 0x605260 <_ZN7ToolBox15Instrumentation6detail19ProfilerMeasurementI12MeasurementsLS3_1EE10executionsE>
   0x00000000004026ea <+298>:   add    $0x8,%rsp
   0x00000000004026ee <+302>:   pop    %rbx
   0x00000000004026ef <+303>:   pop    %r12
   0x00000000004026f1 <+305>:   pop    %r14
   0x00000000004026f3 <+307>:   pop    %r15
   0x00000000004026f5 <+309>:   retq   
End of assembler dump.

我也用矩阵乘法尝试了同样的方法，不过gcc足够聪明，即使用goto/标签语法书写也能识别出矩阵乘法。

结论:

即使我们没有看到速度损失，我们也看到gcc无法应用那些本可以加速计算的优化。在对CPU要求更高的任务中，这可能会导致性能下降。

使用goto可能会获得较小的速度优势。然而，情况也可能恰恰相反。编译器在检测并展开循环、或使用SIMD指令优化循环方面已经变得非常出色。使用goto很可能会让编译器失去所有这些优化机会，因为这些优化并不是为goto语句而设计的。您还可以通过编写函数来避免使用goto，这样编译器就可以内联函数并消除跳转。

#include <iostream>
#include <array>
#include "ToolBox/Instrumentation/Profiler.hpp"

// Size helpers: kilo = 1024, mega = 1024 * 1024.
constexpr size_t kilo = 1024;
constexpr size_t mega = kilo * 1024;

// Number of elements in each benchmark buffer: 512 * 1024 * 1024 = 536870912.
constexpr size_t size = 512*mega;
// Fixed-size buffer of `size` chars (one byte per element, i.e. 512 MiB per object).
using container = std::array<char, size>;
// Keys identifying the two code paths being timed; passed to profile(...) in
// sum_jump() and sum(), and given display names when the profiler is built in main().
enum class Measurements {
    jump,
    loop
};

// Element-wise addition of two buffers, written as an ordinary structured loop.
// Stores data1[k] + data2[k] into result[k] for every index k in [0, size).
// The profile(...) statement tags this call as the `loop` measurement.
void sum(container& result, const container& data1, const container& data2) {
    profile(Measurements::loop);
    unsigned int idx = 0;
    while (idx < size) {
        result[idx] = data1[idx] + data2[idx];
        ++idx;
    }
}

// Element-wise addition of two buffers, with the loop spelled out by hand as a
// label plus a conditional backward goto (the variant compared against sum()).
// Stores data1[i] + data2[i] into result[i] for every index i in [0, size).
// The profile(...) statement tags this call as the `jump` measurement.
void sum_jump(container& result, const container& data1, const container& data2) {
    profile(Measurements::jump);
    unsigned int i = 0;
label:
    // The body runs before the bound is tested; that is safe because `size`
    // is a non-zero compile-time constant.
    result[i] = data1[i] + data2[i];
    i++;
    // Jump back while elements remain. The earlier test (i == size) was false
    // right after the first element, so only index 0 was ever processed.
    if(i < size) goto label;
}

int main() {
    // This segment is just for benchmarking purposes
    // Just ignore this
    ToolBox::Instrumentation::Profiler<Measurements, std::chrono::nanoseconds, 2> profiler(
        std::cout,
        {
            {Measurements::jump, "jump"},
            {Measurements::loop, "loop"}
        }
    );


    // allocate memory to execute our sum functions on
    container data1, data2, result;

    // run the benchmark 100 times to account for caching of the data
    for(unsigned i = 0; i < 100; i++) {

    sum_jump(result, data1, data2);
    sum(result, data1, data2);

    }
}

好的，我们看到运行时间上并没有差别，因为瓶颈在内存带宽，而不是CPU指令。但让我们看看生成的汇编指令：

Dump of assembler code for function sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&):
   0x00000000004025c0 <+0>:     push   %r15
   0x00000000004025c2 <+2>:     push   %r14
   0x00000000004025c4 <+4>:     push   %r12
   0x00000000004025c6 <+6>:     push   %rbx
   0x00000000004025c7 <+7>:     push   %rax
   0x00000000004025c8 <+8>:     mov    %rdx,%r15
   0x00000000004025cb <+11>:    mov    %rsi,%r12
   0x00000000004025ce <+14>:    mov    %rdi,%rbx
   0x00000000004025d1 <+17>:    callq  0x402110 <_ZNSt6chrono3_V212system_clock3nowEv@plt>
   0x00000000004025d6 <+22>:    mov    %rax,%r14
   0x00000000004025d9 <+25>:    lea    0x20000000(%rbx),%rcx
   0x00000000004025e0 <+32>:    lea    0x20000000(%r12),%rax
   0x00000000004025e8 <+40>:    lea    0x20000000(%r15),%rsi
   0x00000000004025ef <+47>:    cmp    %rax,%rbx
   0x00000000004025f2 <+50>:    sbb    %al,%al
   0x00000000004025f4 <+52>:    cmp    %rcx,%r12
   0x00000000004025f7 <+55>:    sbb    %dl,%dl
   0x00000000004025f9 <+57>:    and    %al,%dl
   0x00000000004025fb <+59>:    cmp    %rsi,%rbx
   0x00000000004025fe <+62>:    sbb    %al,%al
   0x0000000000402600 <+64>:    cmp    %rcx,%r15
   0x0000000000402603 <+67>:    sbb    %cl,%cl
   0x0000000000402605 <+69>:    test   $0x1,%dl
   0x0000000000402608 <+72>:    jne    0x40268b <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+203>
   0x000000000040260e <+78>:    and    %cl,%al
   0x0000000000402610 <+80>:    and    $0x1,%al
   0x0000000000402612 <+82>:    jne    0x40268b <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+203>
   0x0000000000402614 <+84>:    xor    %eax,%eax
   0x0000000000402616 <+86>:    nopw   %cs:0x0(%rax,%rax,1)
   0x0000000000402620 <+96>:    movdqu (%r12,%rax,1),%xmm0
   0x0000000000402626 <+102>:   movdqu 0x10(%r12,%rax,1),%xmm1
   0x000000000040262d <+109>:   movdqu (%r15,%rax,1),%xmm2
   0x0000000000402633 <+115>:   movdqu 0x10(%r15,%rax,1),%xmm3
   0x000000000040263a <+122>:   paddb  %xmm0,%xmm2
   0x000000000040263e <+126>:   paddb  %xmm1,%xmm3
   0x0000000000402642 <+130>:   movdqu %xmm2,(%rbx,%rax,1)
   0x0000000000402647 <+135>:   movdqu %xmm3,0x10(%rbx,%rax,1)
   0x000000000040264d <+141>:   movdqu 0x20(%r12,%rax,1),%xmm0
   0x0000000000402654 <+148>:   movdqu 0x30(%r12,%rax,1),%xmm1
   0x000000000040265b <+155>:   movdqu 0x20(%r15,%rax,1),%xmm2
   0x0000000000402662 <+162>:   movdqu 0x30(%r15,%rax,1),%xmm3
   0x0000000000402669 <+169>:   paddb  %xmm0,%xmm2
   0x000000000040266d <+173>:   paddb  %xmm1,%xmm3
   0x0000000000402671 <+177>:   movdqu %xmm2,0x20(%rbx,%rax,1)
   0x0000000000402677 <+183>:   movdqu %xmm3,0x30(%rbx,%rax,1)
   0x000000000040267d <+189>:   add    $0x40,%rax
   0x0000000000402681 <+193>:   cmp    $0x20000000,%rax
   0x0000000000402687 <+199>:   jne    0x402620 <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+96>
   0x0000000000402689 <+201>:   jmp    0x4026d5 <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+277>
   0x000000000040268b <+203>:   xor    %eax,%eax
   0x000000000040268d <+205>:   nopl   (%rax)
   0x0000000000402690 <+208>:   movzbl (%r15,%rax,1),%ecx
   0x0000000000402695 <+213>:   add    (%r12,%rax,1),%cl
   0x0000000000402699 <+217>:   mov    %cl,(%rbx,%rax,1)
   0x000000000040269c <+220>:   movzbl 0x1(%r15,%rax,1),%ecx
   0x00000000004026a2 <+226>:   add    0x1(%r12,%rax,1),%cl
   0x00000000004026a7 <+231>:   mov    %cl,0x1(%rbx,%rax,1)
   0x00000000004026ab <+235>:   movzbl 0x2(%r15,%rax,1),%ecx
   0x00000000004026b1 <+241>:   add    0x2(%r12,%rax,1),%cl
   0x00000000004026b6 <+246>:   mov    %cl,0x2(%rbx,%rax,1)
   0x00000000004026ba <+250>:   movzbl 0x3(%r15,%rax,1),%ecx
   0x00000000004026c0 <+256>:   add    0x3(%r12,%rax,1),%cl
   0x00000000004026c5 <+261>:   mov    %cl,0x3(%rbx,%rax,1)
   0x00000000004026c9 <+265>:   add    $0x4,%rax
   0x00000000004026cd <+269>:   cmp    $0x20000000,%rax
   0x00000000004026d3 <+275>:   jne    0x402690 <sum(std::array<char, 536870912ul>&, std::array<char, 536870912ul> const&, std::array<char, 536870912ul> const&)+208>
   0x00000000004026d5 <+277>:   callq  0x402110 <_ZNSt6chrono3_V212system_clock3nowEv@plt>
   0x00000000004026da <+282>:   sub    %r14,%rax
   0x00000000004026dd <+285>:   add    %rax,0x202b74(%rip)        # 0x605258 <_ZN7ToolBox15Instrumentation6detail19ProfilerMeasurementI12MeasurementsLS3_1EE14totalTimeSpentE>
   0x00000000004026e4 <+292>:   incl   0x202b76(%rip)        # 0x605260 <_ZN7ToolBox15Instrumentation6detail19ProfilerMeasurementI12MeasurementsLS3_1EE10executionsE>
   0x00000000004026ea <+298>:   add    $0x8,%rsp
   0x00000000004026ee <+302>:   pop    %rbx
   0x00000000004026ef <+303>:   pop    %r12
   0x00000000004026f1 <+305>:   pop    %r14
   0x00000000004026f3 <+307>:   pop    %r15
   0x00000000004026f5 <+309>:   retq   
End of assembler dump.

我也用矩阵乘法尝试了同样的方法，不过gcc足够聪明，即使用goto/标签语法书写也能识别出矩阵乘法。

结论:

即使我们没有看到速度损失，我们也看到gcc无法应用那些本可以加速计算的优化。在对CPU要求更高的任务中，这可能会导致性能下降。

会吗？当然，从理论上讲，如果您使用的是一个特别愚蠢的编译器，或者编译时禁用了优化

实际上，绝对不会。优化编译器在优化循环和switch语句方面没有什么困难。与遵循常规做法、使用优化器熟悉并专门针对其优化的循环结构相比，使用无条件跳转更有可能把优化器搞糊涂。

这其实回到了一条一般规则：优化器对标准的惯用代码处理得最好，因为这正是它们被设计用来识别的代码。考虑到优化器大量依赖模式匹配，如果您偏离了常见模式，就不太可能得到最佳的代码输出。

例如：如果我在switch语句中嵌套了一个while循环，它将嵌套在另一个循环中，而这个循环将嵌套在另一个switch语句中

是的，读了你对代码的描述我很困惑，但编译器不会。与人类不同，编译器处理嵌套循环没有任何问题。只要你写的是合法的C++代码，就不会把它搞糊涂——除非编译器有bug，那样的话一切都无从保证。不管嵌套多少层，如果逻辑结果是跳出整个块，编译器就会生成相应的代码，就像您自己写了一个goto一样。如果您试图自己编写goto，则由于作用域问题，更可能得到次优代码，因为这会要求编译器生成保存局部变量、调整栈帧等的代码。许多标准优化通常是在循环上执行的，然而，如果您在循环中加入一个goto，这些优化要么无法进行，要么根本不会被应用。

此外，即使跳过了其他指令，跳转也并不总能带来更快的代码。在许多情况下，在现代的、深度流水线化的乱序执行处理器上，分支预测失败会造成严重的开销，远远超过直接执行中间的指令并丢弃（忽略）其结果的代价。针对特定平台的优化编译器了解这一点，并可能决定生成无分支代码。

gdb -batch -ex 'file a.out' -ex 'disassemble sum'

// Returns the sum of the first `count` elements of the array `val`.
//
// @param val    pointer to at least `count` doubles; not dereferenced when
//               `count` is 0
// @param count  number of elements to add
// @return       val[0] + ... + val[count-1], or 0 when `count` is 0
double calcsum( double* val, unsigned int count )
{
    double sum = 0;
    // Advance the index j, not the bound. The earlier `++count` left j at 0,
    // so val[0] was re-added until `count` wrapped around to 0.
    for ( unsigned int j=0; j<count; ++j ) {
        sum += val[j];
    }
    return sum;
}

clang++ -S -emit-llvm -O3 test.cpp   # will create test.ll

$ cat test.ll


; LLVM IR for the function with mangled name _Z7calcsumPdj.
; %0 is the double* argument, %1 is the i32 argument.
; The function contains a single load (%5, from %0 at offset 0); every fadd
; below adds that same value. The additions are split into an 8x-unrolled
; loop (block 28) and a one-at-a-time remainder loop (block 17). The trip
; count is derived from 0 - %1.
define double @_Z7calcsumPdj(double* nocapture readonly, i32) 
local_unnamed_addr #0 {
  ; Early exit: when %1 == 0 jump straight to the return block, which yields 0.0.
  %3 = icmp eq i32 %1, 0
  br i1 %3, label %26, label %4

; Set-up: load the one value that is summed, and compute the iteration counts.
; %6 = 0 - %1 is the total number of additions, %7 = %6 & 7 is the remainder
; left over after groups of eight. If %1 > -8 (unsigned), fewer than eight
; additions are needed and the unrolled loop is skipped.
; <label>:4:                                      ; preds = %2
  %5 = load double, double* %0, align 8, !tbaa !1
  %6 = sub i32 0, %1
  %7 = and i32 %6, 7
  %8 = icmp ugt i32 %1, -8
  br i1 %8, label %12, label %9

; %10 = %6 - %7 is the number of additions handled by the unrolled loop.
; <label>:9:                                      ; preds = %4
  %10 = sub i32 %6, %7
  br label %28

; <label>:11:                                     ; preds = %28
  br label %12

; After the unrolled part: %14 is the running sum so far. When the remainder
; %7 is zero there is nothing left to add.
; <label>:12:                                     ; preds = %11, %4
  %13 = phi double [ undef, %4 ], [ %38, %11 ]
  %14 = phi double [ 0.000000e+00, %4 ], [ %38, %11 ]
  %15 = icmp eq i32 %7, 0
  br i1 %15, label %24, label %16

; <label>:16:                                     ; preds = %12
  br label %17

; Remainder loop: adds %5 once per iteration, counting %19 down from %7 to 0.
; <label>:17:                                     ; preds = %17, %16
  %18 = phi double [ %14, %16 ], [ %20, %17 ]
  %19 = phi i32 [ %7, %16 ], [ %21, %17 ]
  %20 = fadd double %18, %5
  %21 = add i32 %19, -1
  %22 = icmp eq i32 %21, 0
  br i1 %22, label %23, label %17, !llvm.loop !5

; <label>:23:                                     ; preds = %17
  br label %24

; Merge the result of the remainder loop with the no-remainder path.
; <label>:24:                                     ; preds = %12, %23
  %25 = phi double [ %13, %12 ], [ %20, %23 ]
  br label %26

; Return block: 0.0 on the early-exit path, otherwise the accumulated sum.
; <label>:26:                                     ; preds = %24, %2
  %27 = phi double [ 0.000000e+00, %2 ], [ %25, %24 ]
  ret double %27

; Unrolled loop: eight chained fadds of %5 per iteration, counting %30 down
; from %10 to 0 in steps of 8.
; <label>:28:                                     ; preds = %28, %9
  %29 = phi double [ 0.000000e+00, %9 ], [ %38, %28 ]
  %30 = phi i32 [ %10, %9 ], [ %39, %28 ]
  %31 = fadd double %29, %5
  %32 = fadd double %31, %5
  %33 = fadd double %32, %5
  %34 = fadd double %33, %5
  %35 = fadd double %34, %5
  %36 = fadd double %35, %5
  %37 = fadd double %36, %5
  %38 = fadd double %37, %5
  %39 = add i32 %30, -8
  %40 = icmp eq i32 %39, 0
  br i1 %40, label %11, label %28
}