Gcc 预取示例？_Gcc_Optimization_Assembly_Prefetch

Gcc 预取示例？

gcc optimization assembly

Gcc 预取示例？,gcc,optimization,assembly,prefetch,Gcc,Optimization,Assembly,Prefetch,有人能给出一个例子或链接到一个例子，该例子在GCC中使用\u内置预取（或者通常只使用asm指令预取）来获得显著的性能优势吗？特别是，我希望示例满足以下标准：这是一个简单、小、独立的示例删除\u内置\u预取指令会导致性能下降用相应的内存访问替换\u内置\u预取指令会导致性能下降也就是说，我想要一个最短的示例，显示\u内置\u预取执行没有它就无法管理的优化。来自：（i=0；i

有人能给出一个例子或链接到一个例子，该例子在GCC中使用

__builtin_prefetch

（或者更一般地，直接使用汇编预取指令）来获得显著的性能优势吗？特别是，我希望示例满足以下标准：

这是一个简单、小、独立的示例

删除

__builtin_prefetch

指令会导致性能下降

用相应的内存访问替换

__builtin_prefetch

指令会导致性能下降

也就是说，我想要一个最短的示例，显示

__builtin_prefetch

实现了没有它就无法完成的优化。

来自：

for (i = 0; i < n; i++)
  {
    a[i] = a[i] + b[i];
    __builtin_prefetch (&a[i+j], 1, 1);
    __builtin_prefetch (&b[i+j], 0, 1);
    /* ... */
  }

这是我从一个更大的项目中提取的一段实际代码。（对不起，这是我能找到的最短的一个，它的预取速度明显加快。）这段代码执行非常大的数据转置

本例使用SSE预取指令，该指令可能与GCC发出的指令相同

要运行此示例，您需要为x64编译此文件，并拥有超过4GB的内存。您也可以使用较小的数据量运行它，但那样运行得太快，无法准确计时

#include <iostream>
using std::cout;
using std::endl;

#include <emmintrin.h>
#include <malloc.h>
#include <time.h>
#include <string.h>

#define ENABLE_PREFETCH


#define f_vector    __m128d
#define i_ptr       size_t
/// Swaps two blocks of vectors element by element.
///
/// @param A  First block; receives the contents of B.
/// @param B  Second block; receives the contents of A.
/// @param L  Number of f_vector elements to exchange. L == 0 is a no-op.
///
/// Both pointers are dereferenced as f_vector (__m128d); the caller in this
/// file obtains them from _mm_malloc(bytes,16), so 16-byte alignment is
/// presumably expected -- TODO confirm for other callers.
inline void swap_block(f_vector *A,f_vector *B,i_ptr L){
    //  To be super-optimized later.

    f_vector *const stop = A + L;

    //  Pre-tested loop: the previous do/while form touched A[0] and B[0]
    //  even when L was 0. For L > 0 the sequence of loads/stores is unchanged.
    while (A < stop){
        const f_vector tmpA = *A;
        const f_vector tmpB = *B;
        *A++ = tmpB;
        *B++ = tmpA;
    }
}
/// In-place transpose of a square grid of fixed-size units.
///
/// @param T      Start of the grid; it holds x rows of x units each.
/// @param block  Number of f_vector elements per unit
///               (a unit is block * sizeof(f_vector) bytes).
/// @param x      Number of rows, which equals the number of columns.
///
/// Preconditions (not checked): 0 < block and 1 < x.
void transpose_even(f_vector *T,i_ptr block,i_ptr x){
    //  Distance, in f_vector elements, between vertically adjacent units.
    const i_ptr row_stride = block * x;
    //  Distance from one diagonal unit to the next (one row down, one unit right).
    const i_ptr diag_stride = row_stride + block;

    //  One past the last element of the grid, and the start of its last row.
    f_vector *const grid_end = T + row_stride * x;
    f_vector *const last_row = grid_end - row_stride;

    //  Walk the diagonal; the loop stops before any unit located in the last row.
    f_vector *diag = T;
    do{
        //  `right` moves along the row to the right of the diagonal unit,
        //  `below` moves down the column underneath it.
        f_vector *right = diag + block;
        f_vector *below = diag + row_stride;

        do{

#ifdef ENABLE_PREFETCH
            //  Hint the unit `below` will point at on the next inner iteration.
            //  On the final iteration this address is not below grid_end; it is
            //  only handed to _mm_prefetch and never dereferenced here.
            _mm_prefetch(reinterpret_cast<char*>(below + row_stride),_MM_HINT_T0);
#endif

            swap_block(right,below,block);

            right += block;
            below += row_stride;
        }while (below < grid_end);

        diag += diag_stride;
    }while (diag < last_row);
}
int main(){

    i_ptr dimension = 4096;
    i_ptr block = 16;

    i_ptr words = block * dimension * dimension;
    i_ptr bytes = words * sizeof(f_vector);

    cout << "bytes = " << bytes << endl;
//    system("pause");

    f_vector *T = (f_vector*)_mm_malloc(bytes,16);
    if (T == NULL){
        cout << "Memory Allocation Failure" << endl;
        system("pause");
        exit(1);
    }
    memset(T,0,bytes);

    //  Perform in-place data transpose
    cout << "Starting Data Transpose...   ";
    clock_t start = clock();
    transpose_even(T,block,dimension);
    clock_t end = clock();

    cout << "Done" << endl;
    cout << "Time: " << (double)(end - start) / CLOCKS_PER_SEC << " seconds" << endl;

    _mm_free(T);
    system("pause");
}

当我在启用ENABLE_PREFETCH的情况下运行它时，这是输出：

bytes = 4294967296
Starting Data Transpose...   Done
Time: 0.725 seconds
Press any key to continue . . .

当我在禁用ENABLE_PREFETCH的情况下运行它时，这是输出：

bytes = 4294967296
Starting Data Transpose...   Done
Time: 0.822 seconds
Press any key to continue . . .

因此，预取的速度提高了13%

编辑：

以下是更多的结果：

Operating System: Windows 7 Professional/Ultimate
Compiler: Visual Studio 2010 SP1
Compile Mode: x64 Release

Intel Core i7 860 @ 2.8 GHz, 8 GB DDR3 @ 1333 MHz
Prefetch   : 0.868
No Prefetch: 0.960

Intel Core i7 920 @ 3.5 GHz, 12 GB DDR3 @ 1333 MHz
Prefetch   : 0.725
No Prefetch: 0.822

Intel Core i7 2600K @ 4.6 GHz, 16 GB DDR3 @ 1333 MHz
Prefetch   : 0.718
No Prefetch: 0.796

2 x Intel Xeon X5482 @ 3.2 GHz, 64 GB DDR2 @ 800 MHz
Prefetch   : 2.273
No Prefetch: 2.666

二进制搜索是一个简单的例子，可以从显式预取中获益。二进制搜索中的访问模式在硬件预取程序看来几乎是随机的，因此它几乎不可能准确预测要获取什么

在本例中，我预取了当前迭代中下一个循环迭代的两个可能的“中间”位置。其中一个预取可能永远不会被使用，但另一个会被使用（除非这是最终的迭代）

请注意，在预取版本中，一级缓存的加载量是原来的两倍。我们实际上做了很多工作，但是内存访问模式对管道更友好。这也显示了权衡。虽然这段代码单独运行速度更快，但我们已将大量垃圾加载到缓存中，这可能会给应用程序的其他部分带来更大的压力。

我从@JamesScriven和@Mystical提供的优秀答案中学到了很多。然而，他们的例子只提供了一个适度的提升——这个答案的目的是提供一个（我必须承认有些人为的）例子，其中预取有更大的影响（在我的机器上大约是因子4）

现代体系结构有三个可能的瓶颈：CPU速度、内存带宽和内存延迟。预取就是减少内存访问的延迟

在一个完美的场景中，延迟对应于X个计算步骤，我们会有一个oracle，它会告诉我们在X个计算步骤中要访问哪些内存，这些数据的预取会启动，它会在X个计算步骤之后及时到达

对于许多算法来说，我们（几乎）处于这个完美的世界。对于一个简单的For循环，很容易预测X步之后需要哪些数据。无序执行和其他硬件技巧在这里做得非常好，几乎完全隐藏了延迟

这就是为什么@Mystical的示例有如此小的改进的原因：预取器已经相当不错了，只是没有太多的改进空间。这项任务也是内存受限的，所以可能没有多少带宽了——这可能会成为限制因素。我最多能看到我的机器有8%的改进

来自@JamesScriven示例的重要见解是：在从内存中提取当前数据之前，我们和CPU都不知道下一个访问地址——这种依赖关系非常重要，否则无序执行将导致前瞻，硬件将能够预取数据。然而，因为我们只能推测一步，所以没有那么大的潜力。我无法在我的机器上获得超过40%的收益

因此，让我们“作弊”一下，刻意构造数据：这样我们自己可以提前X步知道将要访问哪个地址，但由于该地址依赖于尚未读取的数据，硬件无法预测它（完整程序见答案末尾）：

加速比在4到5倍之间

prefetch_demo.cpp 的完整代码：

//prefetch_demo.cpp

#include <vector>
#include <iostream>
#include <iomanip>
#include <chrono>

// Number of entries in the lookup table walked by the benchmark.
const int SIZE=1024*1024*1;
// Number of table accesses performed per timed run.
const int STEP_CNT=1024*1024*10;

// Maps an index to its successor: (current*10001+328) modulo SIZE.
// The arithmetic is carried out in unsigned int, so the product wraps.
unsigned int next(unsigned int current){
   const unsigned int mixed = current*10001 + 328;
   return mixed % SIZE;
}


/// Benchmark kernel that sums table entries along a data-dependent chain of
/// indices, optionally issuing a software prefetch for a future position.
///
/// @tparam prefetch  When true, each step calls __builtin_prefetch on the
///                   "oracle" position before doing the actual load.
template<bool prefetch>
struct Worker{
   // Private copy of the table handed to the constructor.
   std::vector<int> mem;

   // Running sum of the visited entries; it is not reset between calls.
   double result;
   // Number of next() steps the oracle index is advanced before the walk starts.
   int oracle_offset;

   /// Performs STEP_CNT accesses starting from index 0.
   void operator()(){
        // Advance the oracle oracle_offset steps along next().
        unsigned int prefetch_index=0;
        for(int i=0;i<oracle_offset;i++)
            prefetch_index=next(prefetch_index);

        unsigned int index=0;
        for(int i=0;i<STEP_CNT;i++){
            //prefetch memory block used in a future iteration
            //(arguments 0,1: rw=0 and locality=1 in GCC's builtin)
            if(prefetch){
                __builtin_prefetch(mem.data()+prefetch_index,0,1);
            }
            //actual work:
            result+=mem[index];

            //prepare next iteration
            prefetch_index=next(prefetch_index);
            // The next index is derived from the value just loaded, so it is
            // not known before that load completes.
            index=next(mem[index]);
        }
   }

   /// Copies the table. Taking it by const reference also accepts const
   /// vectors and temporaries; the argument is never modified.
   Worker(const std::vector<int> &mem_):
       mem(mem_), result(0.0), oracle_offset(0)
   {}
};

/// Invokes `worker()` once and returns the elapsed wall time in seconds,
/// measured with std::chrono::high_resolution_clock at nanosecond granularity.
template <typename Callable>
double timeit(Callable &worker){
    typedef std::chrono::high_resolution_clock timer;

    const timer::time_point started = timer::now();
    worker();
    const timer::time_point finished = timer::now();

    const auto nanos =
        std::chrono::duration_cast<std::chrono::nanoseconds>(finished-started).count();
    return nanos/1e9;
}


 // Driver: times the prefetching and non-prefetching workers for oracle
 // offsets 0..19 and prints one tab-separated row per offset.
 int main() {
     // Identity table (keys[i] == i): with it, next(mem[index]) equals
     // next(index), so the oracle index follows the same sequence as the walk.
     std::vector<int> keys(SIZE);
     int value=0;
     for (auto &slot : keys){
       slot = value++;
     }

     // Each worker keeps its own copy of the table.
     Worker<false> without_prefetch(keys);
     Worker<true> with_prefetch(keys);

     std::cout<<"#preloops\ttime no prefetch\ttime prefetch\tfactor\n";
     std::cout<<std::setprecision(17);

     for(int lookahead=0;lookahead<20;++lookahead){
         //let oracle see `lookahead` steps in the future:
         without_prefetch.oracle_offset=lookahead;
         with_prefetch.oracle_offset=lookahead;

         //measure (prefetching variant first, as before):
         const double time_with_prefetch=timeit(with_prefetch);
         const double time_no_prefetch=timeit(without_prefetch);
         const double factor=time_no_prefetch/time_with_prefetch;

         std::cout<<lookahead<<"\t"
                  <<time_no_prefetch<<"\t"
                  <<time_with_prefetch<<"\t"
                  <<factor<<"\n";
     }

 }

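//prefetch_demo.cpp
#include <vector>
#include <iostream>
#include <iomanip>
#include <chrono>
const int SIZE=1024*1024*1;
const int STEP_CNT=1024*1024*10;
unsigned int next(unsigned int current){
   return (current*10001+328)%SIZE;
}
template<bool prefetch>
struct Worker{
   std::vector<int> mem;
   double result;
   int oracle_offset;
   void operator()(){
        unsigned int prefetch_index=0;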
        for(int i=0;i

预取的粒度可以按缓存行大小来优化：对于大多数现代64位处理器，缓存行大小为64字节，例如可以用一条指令预加载一个uint32_t[16]。
例如，在ArmV8上，我通过实验发现：把内存指针强制转换为uint32_t 4x4矩阵向量类型（大小为64字节）后，所需的预取指令数减少了一半。在此之前，我必须每次把偏移量增加8，因为每条指令似乎只加载了一半的数据——尽管按我的理解，它本应取回整条缓存行。
预取uint32_t[32]的原始代码示例：
// Original variant: four prefetch hints, 8 elements apart, starting at V[addrindex].
// B and V are declared outside this snippet.
// NOTE(review): `addrindex` is an int initialised from the address &B[0]; that is
// not valid C++ without a cast -- presumably an element index into V was intended.
int addrindex = &B[0];
    __builtin_prefetch(&V[addrindex]);
    __builtin_prefetch(&V[addrindex + 8]);
    __builtin_prefetch(&V[addrindex + 16]);
    __builtin_prefetch(&V[addrindex + 24]);

修改之后：
// Revised variant: two prefetch hints, 16 elements apart, with the address cast
// to uint32x4x4_t* (presumably the 64-byte NEON type from <arm_neon.h> -- confirm).
// B and V are declared outside this snippet.
// NOTE(review): `addrindex` is an int initialised from the address &B[0]; that is
// not valid C++ without a cast -- presumably an element index into V was intended.
int addrindex = &B[0];
__builtin_prefetch((uint32x4x4_t *) &V[addrindex]);
__builtin_prefetch((uint32x4x4_t *) &V[addrindex + 16]);

出于某种原因，地址索引/偏移量的int数据类型提供了更好的性能。在Cortex-a53上使用GCC 8进行测试。如果您发现它没有像我的情况那样预取所有数据，那么在其他体系结构上使用等效的64字节向量可能会提供相同的性能改进
//index successor used to scatter the accesses across memory:
//(current*10001+328) modulo SIZE, evaluated in unsigned arithmetic
unsigned int next(unsigned int current){
   const unsigned int mixed = current*10001 + 328;
   return mixed % SIZE;
}

//the actual work is happening here
//(excerpt: this is the call operator of the Worker<prefetch> struct from
// prefetch_demo.cpp; mem, result, oracle_offset and prefetch come from it)
void operator()(){

    //set up the oracle - let see it in the future oracle_offset steps
    unsigned int prefetch_index=0;
    for(int i=0;i<oracle_offset;i++)
        prefetch_index=next(prefetch_index);

    unsigned int index=0;
    for(int i=0;i<STEP_CNT;i++){
        //use oracle and prefetch memory block used in a future iteration
        if(prefetch){
            __builtin_prefetch(mem.data()+prefetch_index,0,1);
        }

        //actual work, the less the better
        result+=mem[index];

        //prepare next iteration
        prefetch_index=next(prefetch_index);  //update oracle
        index=next(mem[index]);               //dependency on `mem[index]` is VERY important to prevent hardware from predicting future
    }
}

>>> g++ -std=c++11 prefetch_demo.cpp -O3 -o prefetch_demo
>>> ./prefetch_demo
#preloops   time no prefetch    time prefetch   factor
...
7   1.0711102260000001  0.230566831 4.6455521002498408
8   1.0511602149999999  0.22651144600000001 4.6406494398521474
9   1.049024333 0.22841439299999999 4.5926367389641687
....

//prefetch_demo.cpp

#include <vector>
#include <iostream>
#include <iomanip>
#include <chrono>

// Number of entries in the lookup table walked by the benchmark.
const int SIZE=1024*1024*1;
// Number of table accesses performed per timed run.
const int STEP_CNT=1024*1024*10;

// Maps an index to its successor: (current*10001+328) modulo SIZE.
// The arithmetic is carried out in unsigned int, so the product wraps.
unsigned int next(unsigned int current){
   const unsigned int mixed = current*10001 + 328;
   return mixed % SIZE;
}


/// Benchmark kernel that sums table entries along a data-dependent chain of
/// indices, optionally issuing a software prefetch for a future position.
///
/// @tparam prefetch  When true, each step calls __builtin_prefetch on the
///                   "oracle" position before doing the actual load.
template<bool prefetch>
struct Worker{
   // Private copy of the table handed to the constructor.
   std::vector<int> mem;

   // Running sum of the visited entries; it is not reset between calls.
   double result;
   // Number of next() steps the oracle index is advanced before the walk starts.
   int oracle_offset;

   /// Performs STEP_CNT accesses starting from index 0.
   void operator()(){
        // Advance the oracle oracle_offset steps along next().
        unsigned int prefetch_index=0;
        for(int i=0;i<oracle_offset;i++)
            prefetch_index=next(prefetch_index);

        unsigned int index=0;
        for(int i=0;i<STEP_CNT;i++){
            //prefetch memory block used in a future iteration
            //(arguments 0,1: rw=0 and locality=1 in GCC's builtin)
            if(prefetch){
                __builtin_prefetch(mem.data()+prefetch_index,0,1);
            }
            //actual work:
            result+=mem[index];

            //prepare next iteration
            prefetch_index=next(prefetch_index);
            // The next index is derived from the value just loaded, so it is
            // not known before that load completes.
            index=next(mem[index]);
        }
   }

   /// Copies the table. Taking it by const reference also accepts const
   /// vectors and temporaries; the argument is never modified.
   Worker(const std::vector<int> &mem_):
       mem(mem_), result(0.0), oracle_offset(0)
   {}
};

/// Invokes `worker()` once and returns the elapsed wall time in seconds,
/// measured with std::chrono::high_resolution_clock at nanosecond granularity.
template <typename Callable>
double timeit(Callable &worker){
    typedef std::chrono::high_resolution_clock timer;

    const timer::time_point started = timer::now();
    worker();
    const timer::time_point finished = timer::now();

    const auto nanos =
        std::chrono::duration_cast<std::chrono::nanoseconds>(finished-started).count();
    return nanos/1e9;
}


 // Driver: times the prefetching and non-prefetching workers for oracle
 // offsets 0..19 and prints one tab-separated row per offset.
 int main() {
     // Identity table (keys[i] == i): with it, next(mem[index]) equals
     // next(index), so the oracle index follows the same sequence as the walk.
     std::vector<int> keys(SIZE);
     int value=0;
     for (auto &slot : keys){
       slot = value++;
     }

     // Each worker keeps its own copy of the table.
     Worker<false> without_prefetch(keys);
     Worker<true> with_prefetch(keys);

     std::cout<<"#preloops\ttime no prefetch\ttime prefetch\tfactor\n";
     std::cout<<std::setprecision(17);

     for(int lookahead=0;lookahead<20;++lookahead){
         //let oracle see `lookahead` steps in the future:
         without_prefetch.oracle_offset=lookahead;
         with_prefetch.oracle_offset=lookahead;

         //measure (prefetching variant first, as before):
         const double time_with_prefetch=timeit(with_prefetch);
         const double time_no_prefetch=timeit(without_prefetch);
         const double factor=time_no_prefetch/time_with_prefetch;

         std::cout<<lookahead<<"\t"
                  <<time_no_prefetch<<"\t"
                  <<time_with_prefetch<<"\t"
                  <<factor<<"\n";
     }

 }

// Original variant: four prefetch hints, 8 elements apart, starting at V[addrindex].
// B and V are declared outside this snippet.
// NOTE(review): `addrindex` is an int initialised from the address &B[0]; that is
// not valid C++ without a cast -- presumably an element index into V was intended.
int addrindex = &B[0];
    __builtin_prefetch(&V[addrindex]);
    __builtin_prefetch(&V[addrindex + 8]);
    __builtin_prefetch(&V[addrindex + 16]);
    __builtin_prefetch(&V[addrindex + 24]);

// Revised variant: two prefetch hints, 16 elements apart, with the address cast
// to uint32x4x4_t* (presumably the 64-byte NEON type from <arm_neon.h> -- confirm).
// B and V are declared outside this snippet.
// NOTE(review): `addrindex` is an int initialised from the address &B[0]; that is
// not valid C++ without a cast -- presumably an element index into V was intended.
int addrindex = &B[0];
__builtin_prefetch((uint32x4x4_t *) &V[addrindex]);
__builtin_prefetch((uint32x4x4_t *) &V[addrindex + 16]);

// Requests `size` bytes from aligned_alloc with 64-byte alignment, then rounds the
// returned address up to a multiple of 64 via (p + 63) & ~63 and stores it in V.
// `size` is declared outside this snippet.
// NOTE(review): the __aligned__(64) attribute as written appears to apply to the
// pointer variable V itself, not to the memory it points to -- confirm intent.
// NOTE(review): V holds the rounded address; if it can ever differ from the pointer
// aligned_alloc returned, V must not be passed to free() -- verify.
uint32_t *V __attribute__((__aligned__(64))) = (uint32_t *)(((uintptr_t)(__builtin_assume_aligned((unsigned char*)aligned_alloc(64,size), 64)) + 63) & ~ (uintptr_t)(63));