C++ 为什么元素添加在单独循环中比在组合循环中快得多?

C++ 为什么元素添加在单独循环中比在组合循环中快得多?,c++,performance,x86,vectorization,compiler-optimization,C++,Performance,X86,Vectorization,Compiler Optimization,假设a1,b1,c1和d1指向堆内存,我的数字代码有以下核心循环 const int n = 100000; for (int j = 0; j < n; j++) { a1[j] += b1[j]; c1[j] += d1[j]; } 在MS上编译并进行完全优化,在Duo(x64)上启用32位,第一个示例需要5.5秒,而双循环示例只需要1.9秒。我的问题是:(请参考我在底部重新表述的问题) PS:我不确定这是否有帮助: 第一个循环的反汇编基本上如下所示(该块在整个程序

假设
a1
b1
c1
d1
指向堆内存,我的数字代码有以下核心循环

const int n = 100000;

for (int j = 0; j < n; j++) {
    a1[j] += b1[j];
    c1[j] += d1[j];
}
使用 MS 编译器并开启完全优化进行编译,在 Duo(x64)处理器上以 32 位模式运行时,第一个示例(上面的单个组合循环)需要 5.5 秒,而双循环示例(把两条加法语句分别放进两个独立循环)只需要 1.9 秒。我的问题是:(请参考我在底部重新表述的问题)

PS:我不确定这是否有帮助:

第一个循环的反汇编基本上如下所示(该块在整个程序中重复大约五次):

双循环示例的每个循环都会生成此代码(以下代码块重复大约三次):

事实证明,最初的问题本身意义不大,因为这种行为严重依赖于数组大小(n)和 CPU 缓存的大小。因此,如果还有人感兴趣,我将问题重新表述如下:

您能否深入了解导致不同缓存行为的细节,如下图中的五个区域所示?

通过为这些CPU提供类似的图表,指出CPU/缓存体系结构之间的差异也可能是有趣的。

PPS:这是完整的代码。它使用
Tick_Count
进行更高分辨率的计时,可以通过不定义
TBB_TIMING
宏来禁用:

#include <iostream>
#include <iomanip>
#include <cmath>
#include <string>

//#define TBB_TIMING

#ifdef TBB_TIMING   
#include <tbb/tick_count.h>
using tbb::tick_count;
#else
#include <time.h>
#endif

using namespace std;

//#define preallocate_memory new_cont

enum { new_cont, new_sep };

double *a1, *b1, *c1, *d1;


/// Allocates the four global work arrays a1, b1, c1 and d1 and sets the
/// first n elements of each to 1.0.
///
/// @param cont  new_cont: a single allocation of 4*n doubles that is cut
///              into four consecutive n-element slices;
///              new_sep: four independent n-element allocations.
///              Any other value allocates nothing.
/// @param n     number of doubles per array.
void allo(int cont, int n)
{
    if (cont == new_cont) {
        // One slab; b1, c1 and d1 point at successive slices of it.
        double *slab = new double[n*4];
        a1 = slab;
        b1 = a1 + n;
        c1 = b1 + n;
        d1 = c1 + n;
    } else if (cont == new_sep) {
        a1 = new double[n];
        b1 = new double[n];
        c1 = new double[n];
        d1 = new double[n];
    }

    // Give every element a defined starting value.
    for (int k = 0; k < n; ++k) {
        a1[k] = 1.0;
        d1[k] = 1.0;
        c1[k] = 1.0;
        b1[k] = 1.0;
    }
}

/// Frees the global arrays a1..d1 with delete[].
///
/// @param cont  new_sep: b1, c1, d1 and a1 are each deleted;
///              new_cont: only a1 is deleted (b1, c1 and d1 are not
///              passed to delete[] in this mode).
///              Any other value deletes nothing.
void ff(int cont)
{
    // Only the separate layout has storage behind b1, c1 and d1.
    if (cont == new_sep) {
        delete[] b1;
        delete[] c1;
        delete[] d1;
    }

    // a1 is released in both layouts.
    if (cont == new_sep || cont == new_cont) {
        delete[] a1;
    }
}

/// Runs the addition kernel and reports its throughput.
///
/// Performs m sweeps over the first n elements of the global arrays,
/// computing a1[j] += b1[j] and c1[j] += d1[j], either fused in one inner
/// loop or split into two consecutive inner loops.
///
/// @param n      number of elements processed per sweep.
/// @param m      number of sweeps.
/// @param cont   allocation mode forwarded to allo()/ff(); only used when
///               preallocate_memory is not defined.
/// @param loops  1 selects the fused loop, any other value the split loops.
/// @return 2*n*m divided by the elapsed time in seconds, i.e. the number of
///         additions performed per second.
double plain(int n, int m, int cont, int loops)
{
    // Without preallocation the arrays are created here, before the timer starts.
#ifndef preallocate_memory
    allo(cont,n);
#endif

#ifdef TBB_TIMING   
    tick_count t0 = tick_count::now();
#else
    clock_t start = clock();
#endif
        
    if (loops == 1) {
        // Fused variant: both additions in the same inner loop.
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++){
                a1[j] += b1[j];
                c1[j] += d1[j];
            }
        }
    } else {
        // Split variant: one full pass over a1/b1, then one over c1/d1.
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < n; j++) {
                a1[j] += b1[j];
            }
            for (int j = 0; j < n; j++) {
                c1[j] += d1[j];
            }
        }
    }
    double ret;

    // Two additions per (i, j) pair, divided by the elapsed seconds.
#ifdef TBB_TIMING   
    tick_count t1 = tick_count::now();
    ret = 2.0*double(n)*double(m)/(t1-t0).seconds();
#else
    clock_t end = clock();
    // NOTE(review): if end == start this divides by zero — confirm clock() resolution is sufficient.
    ret = 2.0*double(n)*double(m)/(double)(end - start) *double(CLOCKS_PER_SEC);
#endif
    
    // The arrays are released after the timer has been read.
#ifndef preallocate_memory
    ff(cont);
#endif

    return ret;
}


void main()
{   
    freopen("C:\\test.csv", "w", stdout);

    char *s = " ";

    string na[2] ={"new_cont", "new_sep"};

    cout << "n";

    for (int j = 0; j < 2; j++)
        for (int i = 1; i <= 2; i++)
#ifdef preallocate_memory
            cout << s << i << "_loops_" << na[preallocate_memory];
#else
            cout << s << i << "_loops_" << na[j];
#endif
            
    cout << endl;

    long long nmax = 1000000;

#ifdef preallocate_memory
    allo(preallocate_memory, nmax);
#endif
    
    for (long long n = 1L; n < nmax; n = max(n+1, long long(n*1.2)))
    {
        const long long m = 10000000/n;
        cout << n;

        for (int j = 0; j < 2; j++)
            for (int i = 1; i <= 2; i++)
                cout << s << plain(n, m, j, i);
        cout << endl;
    }
}
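#include <iostream>
#include <iomanip>
#include <cmath>
#include <string>
//#define TBB_TIMING
#ifdef TBB_TIMING
#include <tbb/tick_count.h>
using tbb::tick_count;
#else
#include <time.h>
#endif
using namespace std;
//#define preallocate_memory new_cont
enum { new_cont, new_sep };
double *a1, *b1, *c1, *d1;
void allo(int cont, int n)
{
switch(cont) {
case new_cont:
a1 = new double[n*4];
b1 = a1 + n;
c1 = b1 + n;
d1 = c1 + n;
break;
case new_sep:
a1 = new double[n];
b1 = new double[n];
c1 = new double[n];
d1 = new double[n];
break;
}
for (int i = 0; i < …(代码在此处被截断)

第二个循环涉及的缓存活动要少得多,因此处理器更容易满足内存需求。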

这不是因为代码不同,而是因为缓存:RAM 比 CPU 寄存器慢,因此 CPU 内部有一个缓存,以避免每次变量更改时都写入 RAM。但是缓存没有 RAM 那么大,因此它只能映射 RAM 的一小部分。

第一个代码在每个循环中交替修改远程内存地址,因此需要不断地使缓存失效


第二个代码不是交替的:它只是在相邻地址上流动两次。这使得所有的作业都在缓存中完成,只有在第二个循环开始后才会使其无效。

这是因为在这种情况下 CPU 的缓存未命中次数较少(发生缓存未命中时,它必须等待数组数据从 RAM 芯片中取回)。一个有意思的实验是:不断调整数组的大小,使其先超过 CPU 一级缓存(L1)的容量,再超过二级缓存(L2)的容量,并绘制代码执行时间随数组大小变化的曲线。这条曲线不会像您预期的那样是一条直线。

进一步分析后,我认为这(至少部分)是由四个指针的数据对齐方式引起的。这种对齐会导致一定程度的缓存 bank/way(存储体/路)冲突。

如果我猜对了数组的分配方式,那么它们很可能与页面行对齐

这意味着每个循环中的所有访问都会落在同一个缓存路(cache way)上。不过,英特尔处理器的一级缓存采用 8 路组相联已经有一段时间了。但实际上,性能并不完全均匀:访问 4 路仍然比访问 2 路慢。

编辑:事实上,看起来您是在单独分配所有阵列。 通常,当请求如此大的分配时,分配器将从操作系统请求新的页面。因此,大的分配很有可能出现在页面边界的相同偏移量处

以下是测试代码:

//  Test program: times 10000 repetitions of the element-wise additions
//  a1 += b1 and c1 += d1 over n doubles and prints the elapsed seconds.
//
//  Build switches:
//    ALLOCATE_SEPERATE  if defined, each array gets its own malloc block;
//                       otherwise all four share one block of 4*n doubles.
//    ONE_LOOP           non-zero: both additions in a single loop;
//                       otherwise: two consecutive loops.
//
//  Returns 0 on success, 1 if an allocation fails.
int main(){
    const int n = 100000;

#ifdef ALLOCATE_SEPERATE
    double *a1 = (double*)malloc(n * sizeof(double));
    double *b1 = (double*)malloc(n * sizeof(double));
    double *c1 = (double*)malloc(n * sizeof(double));
    double *d1 = (double*)malloc(n * sizeof(double));
    if (!a1 || !b1 || !c1 || !d1){
        cout << "allocation failed" << endl;
        //  free(NULL) is a no-op, so this releases whatever succeeded.
        free(a1);
        free(b1);
        free(c1);
        free(d1);
        return 1;
    }
#else
    double *a1 = (double*)malloc(n * sizeof(double) * 4);
    if (!a1){
        cout << "allocation failed" << endl;
        return 1;
    }
    double *b1 = a1 + n;
    double *c1 = b1 + n;
    double *d1 = c1 + n;
#endif

    //  Zero the data to prevent any chance of denormals.
    memset(a1,0,n * sizeof(double));
    memset(b1,0,n * sizeof(double));
    memset(c1,0,n * sizeof(double));
    memset(d1,0,n * sizeof(double));

    //  Print the addresses
    cout << a1 << endl;
    cout << b1 << endl;
    cout << c1 << endl;
    cout << d1 << endl;

    clock_t start = clock();

    int c = 0;
    while (c++ < 10000){

#if ONE_LOOP
        for(int j=0;j<n;j++){
            a1[j] += b1[j];
            c1[j] += d1[j];
        }
#else
        for(int j=0;j<n;j++){
            a1[j] += b1[j];
        }
        for(int j=0;j<n;j++){
            c1[j] += d1[j];
        }
#endif

    }

    clock_t end = clock();
    cout << "seconds = " << (double)(end - start) / CLOCKS_PER_SEC << endl;

    system("pause");

    //  Release the buffers; in the packed layout only a1 owns memory.
#ifdef ALLOCATE_SEPERATE
    free(b1);
    free(c1);
    free(d1);
#endif
    free(a1);
    return 0;
}
意见:

  • 6.206秒有一个循环,2.116秒有两个循环。这精确地再现了OP的结果

  • 在前两个测试中,数组是分开分配的。您会注意到它们相对于页面的对齐方式都相同

  • 在后两个测试中,数组被打包在一起,以打破这种对齐。
    addsd       xmm0,mmword ptr [eax+28h]
    movsd       mmword ptr [eax+28h],xmm0
    movsd       xmm0,mmword ptr [ecx+20h]
    addsd       xmm0,mmword ptr [eax+30h]
    movsd       mmword ptr [eax+30h],xmm0
    movsd       xmm0,mmword ptr [ecx+28h]
    addsd       xmm0,mmword ptr [eax+38h]
    movsd       mmword ptr [eax+38h],xmm0
    movsd       xmm0,mmword ptr [ecx+30h]
    addsd       xmm0,mmword ptr [eax+40h]
    movsd       mmword ptr [eax+40h],xmm0
    
    #include <iostream>
    #include <iomanip>
    #include <cmath>
    #include <string>
    
    //#define TBB_TIMING
    
    #ifdef TBB_TIMING   
    #include <tbb/tick_count.h>
    using tbb::tick_count;
    #else
    #include <time.h>
    #endif
    
    using namespace std;
    
    //#define preallocate_memory new_cont
    
    enum { new_cont, new_sep };
    
    double *a1, *b1, *c1, *d1;
    
    
    /// Allocates the four global work arrays a1, b1, c1 and d1 and sets the
    /// first n elements of each to 1.0.
    ///
    /// @param cont  new_cont: a single allocation of 4*n doubles that is cut
    ///              into four consecutive n-element slices;
    ///              new_sep: four independent n-element allocations.
    ///              Any other value allocates nothing.
    /// @param n     number of doubles per array.
    void allo(int cont, int n)
    {
        if (cont == new_cont) {
            // One slab; b1, c1 and d1 point at successive slices of it.
            double *slab = new double[n*4];
            a1 = slab;
            b1 = a1 + n;
            c1 = b1 + n;
            d1 = c1 + n;
        } else if (cont == new_sep) {
            a1 = new double[n];
            b1 = new double[n];
            c1 = new double[n];
            d1 = new double[n];
        }
    
        // Give every element a defined starting value.
        for (int k = 0; k < n; ++k) {
            a1[k] = 1.0;
            d1[k] = 1.0;
            c1[k] = 1.0;
            b1[k] = 1.0;
        }
    }
    
    /// Frees the global arrays a1..d1 with delete[].
    ///
    /// @param cont  new_sep: b1, c1, d1 and a1 are each deleted;
    ///              new_cont: only a1 is deleted (b1, c1 and d1 are not
    ///              passed to delete[] in this mode).
    ///              Any other value deletes nothing.
    void ff(int cont)
    {
        // Only the separate layout has storage behind b1, c1 and d1.
        if (cont == new_sep) {
            delete[] b1;
            delete[] c1;
            delete[] d1;
        }
    
        // a1 is released in both layouts.
        if (cont == new_sep || cont == new_cont) {
            delete[] a1;
        }
    }
    
    /// Runs the addition kernel and reports its throughput.
    ///
    /// Performs m sweeps over the first n elements of the global arrays,
    /// computing a1[j] += b1[j] and c1[j] += d1[j], either fused in one inner
    /// loop or split into two consecutive inner loops.
    ///
    /// @param n      number of elements processed per sweep.
    /// @param m      number of sweeps.
    /// @param cont   allocation mode forwarded to allo()/ff(); only used when
    ///               preallocate_memory is not defined.
    /// @param loops  1 selects the fused loop, any other value the split loops.
    /// @return 2*n*m divided by the elapsed time in seconds, i.e. the number of
    ///         additions performed per second.
    double plain(int n, int m, int cont, int loops)
    {
        // Without preallocation the arrays are created here, before the timer starts.
    #ifndef preallocate_memory
        allo(cont,n);
    #endif
    
    #ifdef TBB_TIMING   
        tick_count t0 = tick_count::now();
    #else
        clock_t start = clock();
    #endif
            
        if (loops == 1) {
            // Fused variant: both additions in the same inner loop.
            for (int i = 0; i < m; i++) {
                for (int j = 0; j < n; j++){
                    a1[j] += b1[j];
                    c1[j] += d1[j];
                }
            }
        } else {
            // Split variant: one full pass over a1/b1, then one over c1/d1.
            for (int i = 0; i < m; i++) {
                for (int j = 0; j < n; j++) {
                    a1[j] += b1[j];
                }
                for (int j = 0; j < n; j++) {
                    c1[j] += d1[j];
                }
            }
        }
        double ret;
    
        // Two additions per (i, j) pair, divided by the elapsed seconds.
    #ifdef TBB_TIMING   
        tick_count t1 = tick_count::now();
        ret = 2.0*double(n)*double(m)/(t1-t0).seconds();
    #else
        clock_t end = clock();
        // NOTE(review): if end == start this divides by zero — confirm clock() resolution is sufficient.
        ret = 2.0*double(n)*double(m)/(double)(end - start) *double(CLOCKS_PER_SEC);
    #endif
        
        // The arrays are released after the timer has been read.
    #ifndef preallocate_memory
        ff(cont);
    #endif
    
        return ret;
    }
    
    
    void main()
    {   
        freopen("C:\\test.csv", "w", stdout);
    
        char *s = " ";
    
        string na[2] ={"new_cont", "new_sep"};
    
        cout << "n";
    
        for (int j = 0; j < 2; j++)
            for (int i = 1; i <= 2; i++)
    #ifdef preallocate_memory
                cout << s << i << "_loops_" << na[preallocate_memory];
    #else
                cout << s << i << "_loops_" << na[j];
    #endif
                
        cout << endl;
    
        long long nmax = 1000000;
    
    #ifdef preallocate_memory
        allo(preallocate_memory, nmax);
    #endif
        
        for (long long n = 1L; n < nmax; n = max(n+1, long long(n*1.2)))
        {
            const long long m = 10000000/n;
            cout << n;
    
            for (int j = 0; j < 2; j++)
                for (int i = 1; i <= 2; i++)
                    cout << s << plain(n, m, j, i);
            cout << endl;
        }
    }
    
    //  Test program: times 10000 repetitions of the element-wise additions
    //  a1 += b1 and c1 += d1 over n doubles and prints the elapsed seconds.
    //
    //  Build switches:
    //    ALLOCATE_SEPERATE  if defined, each array gets its own malloc block;
    //                       otherwise all four share one block of 4*n doubles.
    //    ONE_LOOP           non-zero: both additions in a single loop;
    //                       otherwise: two consecutive loops.
    //
    //  Returns 0 on success, 1 if an allocation fails.
    int main(){
        const int n = 100000;
    
    #ifdef ALLOCATE_SEPERATE
        double *a1 = (double*)malloc(n * sizeof(double));
        double *b1 = (double*)malloc(n * sizeof(double));
        double *c1 = (double*)malloc(n * sizeof(double));
        double *d1 = (double*)malloc(n * sizeof(double));
        if (!a1 || !b1 || !c1 || !d1){
            cout << "allocation failed" << endl;
            //  free(NULL) is a no-op, so this releases whatever succeeded.
            free(a1);
            free(b1);
            free(c1);
            free(d1);
            return 1;
        }
    #else
        double *a1 = (double*)malloc(n * sizeof(double) * 4);
        if (!a1){
            cout << "allocation failed" << endl;
            return 1;
        }
        double *b1 = a1 + n;
        double *c1 = b1 + n;
        double *d1 = c1 + n;
    #endif
    
        //  Zero the data to prevent any chance of denormals.
        memset(a1,0,n * sizeof(double));
        memset(b1,0,n * sizeof(double));
        memset(c1,0,n * sizeof(double));
        memset(d1,0,n * sizeof(double));
    
        //  Print the addresses
        cout << a1 << endl;
        cout << b1 << endl;
        cout << c1 << endl;
        cout << d1 << endl;
    
        clock_t start = clock();
    
        int c = 0;
        while (c++ < 10000){
    
    #if ONE_LOOP
            for(int j=0;j<n;j++){
                a1[j] += b1[j];
                c1[j] += d1[j];
            }
    #else
            for(int j=0;j<n;j++){
                a1[j] += b1[j];
            }
            for(int j=0;j<n;j++){
                c1[j] += d1[j];
            }
    #endif
    
        }
    
        clock_t end = clock();
        cout << "seconds = " << (double)(end - start) / CLOCKS_PER_SEC << endl;
    
        system("pause");
    
        //  Release the buffers; in the packed layout only a1 owns memory.
    #ifdef ALLOCATE_SEPERATE
        free(b1);
        free(c1);
        free(d1);
    #endif
        free(a1);
        return 0;
    }
    
    // Split variant: a full pass computing a[j] += b[j], followed by a
    // second full pass computing c[j] += d[j].
    for(int j=0;j<n;j++){
        a[j] += b[j];
    }
    for(int j=0;j<n;j++){
        c[j] += d[j];
    }
    
    // Fused variant: a single pass in which every iteration updates both
    // a[j] and c[j].
    for(int j=0;j<n;j++){
        a[j] += b[j];
        c[j] += d[j];
    }
    
    // Split variant: a full pass computing a[j] += b[j], followed by a
    // second full pass computing c[j] += d[j].
    for(int j=0;j<n;j++){
     a[j] += b[j];
    }
    for(int j=0;j<n;j++){
     c[j] += d[j];
    }
    
    // Fused variant: a single pass in which every iteration updates both
    // a[j] and c[j].
    for(int j=0;j<n;j++){
     a[j] += b[j];
     c[j] += d[j];
    }
    
    // MemBufferMystery.cpp : Defines the entry point for the console application.
    //
    #include "stdafx.h"
    #include <iostream>
    #include <cmath>
    #include <string>
    #include <time.h>
    
    #define  dbl    double
    #define  MAX_ARRAY_SZ    262145    //16777216    // AKA (2^24)
    #define  STEP_SZ           1024    //   65536    // AKA (2^16)
    
    // Compares a fused loop (a += b and c += d in one pass) with two separate
    // loops for array sizes STEP_SZ, 2*STEP_SZ, ... below MAX_ARRAY_SZ.
    // Each variant is repeated LoopKnt times per size; the time for every
    // size and the cumulative time of each variant are printed.
    //
    // argc/argv are unused.
    // Returns 0 on success, 1 if an allocation fails.
    int _tmain(int argc, _TCHAR* argv[]) {
        long i, j, ArraySz = 0,  LoopKnt = 1024;
        // clock() returns clock_t, so the timing variables use that type.
        clock_t start, elapsed, Cumulative_Combined = 0, Cumulative_Separate = 0;
        dbl *a = NULL, *b = NULL, *c = NULL, *d = NULL, *InitToOnes = NULL;
    
        a = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
        b = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
        c = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
        d = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
        InitToOnes = (dbl *)calloc( MAX_ARRAY_SZ, sizeof(dbl));
        if (!a || !b || !c || !d || !InitToOnes) {
            printf("\n Out of memory");
            // free(NULL) is a no-op, so this releases whatever succeeded.
            free(a); free(b); free(c); free(d); free(InitToOnes);
            return 1;
        }
        // Fill the template array with 1.0.
        for(j = 0; j< MAX_ARRAY_SZ; j++) {
            InitToOnes[j] = 1.0;
        }
    
        // Increase size of arrays and time
        for(ArraySz = STEP_SZ; ArraySz<MAX_ARRAY_SZ; ArraySz += STEP_SZ) {
            a = (dbl *)realloc(a, ArraySz * sizeof(dbl));
            b = (dbl *)realloc(b, ArraySz * sizeof(dbl));
            c = (dbl *)realloc(c, ArraySz * sizeof(dbl));
            d = (dbl *)realloc(d, ArraySz * sizeof(dbl));
            if (!a || !b || !c || !d) {
                // A failed realloc leaves its old block allocated; the
                // process is about to exit, so only the live pointers are freed.
                printf("\n Out of memory");
                free(a); free(b); free(c); free(d); free(InitToOnes);
                return 1;
            }
            // Outside the timing loop, initialize.
            // realloc does not initialise the part of a block it grows, so
            // the accumulators a and c are reset to 0.0 for every size ...
            memset((void *)a, 0, ArraySz * sizeof(dbl));
            memset((void *)c, 0, ArraySz * sizeof(dbl));
            // ... and b and d are set to 1.0 for consistent += performance.
            memcpy((void *)b, (void *)InitToOnes, ArraySz * sizeof(dbl));
            memcpy((void *)d, (void *)InitToOnes, ArraySz * sizeof(dbl));
    
            // Fused variant. j counts down from ArraySz to 1, so the element
            // index is j-1: valid indices are 0 .. ArraySz-1.
            start = clock();
            for(i = LoopKnt; i; i--) {
                for(j = ArraySz; j; j--) {
                    a[j-1] += b[j-1];
                    c[j-1] += d[j-1];
                }
            }
            // Read the clock once so the printed and accumulated values agree.
            elapsed = clock() - start;
            Cumulative_Combined += elapsed;
            printf("\n %6ld miliseconds for combined array sizes %ld and %ld loops",
                    (long)((double)elapsed * 1000.0 / CLOCKS_PER_SEC), ArraySz, LoopKnt);

            // Split variant, same index convention as above.
            start = clock();
            for(i = LoopKnt; i; i--) {
                for(j = ArraySz; j; j--) {
                    a[j-1] += b[j-1];
                }
                for(j = ArraySz; j; j--) {
                    c[j-1] += d[j-1];
                }
            }
            elapsed = clock() - start;
            Cumulative_Separate += elapsed;
            printf("\n %6ld miliseconds for separate array sizes %ld and %ld loops \n",
                    (long)((double)elapsed * 1000.0 / CLOCKS_PER_SEC), ArraySz, LoopKnt);
        }
        printf("\n Cumulative combined array processing took %10.3f seconds",
                (dbl)(Cumulative_Combined/(dbl)CLOCKS_PER_SEC));
        printf("\n Cumulative seperate array processing took %10.3f seconds",
            (dbl)(Cumulative_Separate/(dbl)CLOCKS_PER_SEC));
        getchar();
    
        free(a); free(b); free(c); free(d); free(InitToOnes);
        return 0;
    }
    
    // Number of elements each loop below iterates over.
    const int n=100000;
    
    // Fused variant: every iteration updates both a1[j] and c1[j].
    for(int j=0;j<n;j++){
        a1[j] += b1[j];
        c1[j] += d1[j];
    }
    
    // Split variant: a full pass over a1/b1, then a full pass over c1/d1.
    for(int j=0;j<n;j++){
        a1[j] += b1[j];
    }
    for(int j=0;j<n;j++){
        c1[j] += d1[j];
    }
    
    Sum n=1 : [1,100000] = F1(), F2();
                           F1() = { f(a) = f(a) + f(b); }
                           F2() = { f(c) = f(c) + f(d); }
    
    Sum1 n=1 : [1,100000] = F1();
                            F1() = { f(a) = f(a) + f(b); }
    
    Sum2 n=1 : [1,100000] = F1();
                            F1() = { f(c) = f(c) + f(d); }
    
    const n = 100000
    distTraveledOfFirst = (100 + 500) + ((n-1)*(500 + 500));
    // Simplify
    distTraveledOfFirst = 600 + (99999*1000);
    distTraveledOfFirst = 600 + 99999000;
    distTraveledOfFirst =  99999600;
    // Distance Traveled On First Algorithm = 99,999,600ft
    
    distTraveledOfSecond = 100 + 500 = 600;
    // Distance Traveled On Second Algorithm = 600ft;
    
    DeltaTimeDifference approximately = Loop1(time) - Loop2(time)
    //where
    Loop1(time) = Loop2(time) + (Loop2(time)*[0.6,0.7]) // approximately
    // So when we substitute this back into the difference equation we end up with
    DeltaTimeDifference approximately = (Loop2(time) + (Loop2(time)*[0.6,0.7])) - Loop2(time)
    // And finally we can simplify this to
    DeltaTimeDifference approximately = [0.6,0.7]*Loop2(time)
    
    // Small example type holding a single int.
    // Default construction yields data == 0; A(int) stores the given value
    // and, being non-explicit, also allows implicit conversion from int.
    struct A {
        int data = 0;
        A() {}
        A(int a) { data = a; }
    };
    // Small example type holding a single int.
    // Default construction yields data == 0; B(int) stores the given value.
    struct B {
        int data;
        B() : data{0}{}
        // A constructor must carry the name of its own class.
        B(int b) : data{b}{}
    };
    
    // Placeholder for an operation applied to one element; the body
    // deliberately leaves its argument untouched.
    template<class T>
    void Foo( T& t ) {
        // Do something with t
        (void)t;
    }
    
    // Some looping operation: first stack then heap.
    
    // Stack data:
    A dataSetA[10] = {};
    B dataSetB[10] = {};
    
    // For stack operations this is okay and efficient
    for (int i = 0; i < 10; i++ ) {
       Foo(dataSetA[i]);
       Foo(dataSetB[i]);
    }
    
    // If the above two were on the heap then performing
    // the same algorithm to both within the same loop
    // will create that bottleneck
    // Array new takes the element count in brackets after the type; the
    // trailing () value-initializes the elements.
    A* dataSetA = new A[10]();
    B* dataSetB = new B[10]();
    for ( int i = 0; i < 10; i++ ) {
        Foo(dataSetA[i]); // dataSetA is on the heap here
        Foo(dataSetB[i]); // dataSetB is on the heap here
    } // this will be inefficient.
    
    // To improve the efficiency above, put them into separate loops...
    
    for (int i = 0; i < 10; i++ ) {
        Foo(dataSetA[i]);
    }
    for (int i = 0; i < 10; i++ ) {
        Foo(dataSetB[i]);
    }
    // This will be much more efficient than above.
    // The code isn't perfect syntax, it's only psuedo code
    // to illustrate a point.