Performance 如何从预取内建函数(prefetch intrinsics)中获得可测量的好处?

Performance 如何从预取本质中获得可测量的好处?,performance,x86-64,sse,simd,prefetch,Performance,X86 64,Sse,Simd,Prefetch,在x86_64上使用GCC4.4.5(是的……我知道它很旧)。出于兼容性原因,仅限于SSE2(或更早版本)说明 我有一个我认为应该是教科书上的案例,可以从预取中获得巨大的好处。我有一个32位元素的数组(“A”),它们不是(也不能)按顺序排列的。这些32位元素是一个较大的数据数组(“D”)的索引,该数组由_m128i数据组成。对于“A”的每个元素,我需要从“D”中的适当位置获取_m128i数据,对其执行操作,并将其存储回“D”中的相同位置。实际上,D中的每个“条目”都是“某些常量”\uuuum12

在 x86_64 上使用 GCC 4.4.5(是的……我知道它很旧)。出于兼容性原因,只能使用 SSE2(或更早版本)的指令。

我有一个自认为堪称教科书式的案例,应该能从预取中获得巨大的好处。我有一个由 32 位元素组成的数组(“A”),这些元素不是(也不可能)按顺序排列的。这些 32 位元素是一个更大的数据数组(“D”)的索引,D 由 __m128i 数据组成。对于“A”的每个元素,我需要从“D”中的相应位置取出 __m128i 数据,对其执行操作,再存回“D”中的同一位置。实际上,D 中的每个“条目”占 SOME_CONST 个 __m128i。因此,如果 A 中的值为“1”,则对应 D 中的位置是 D[1 * SOME_CONST]。

由于“A”中的连续元素几乎永远不会指向“D”中的连续位置,因此我倾向于认为硬件预取器将难以完成或无法完成任何有用的任务

然而,只要在“A”中向前看,我就能很容易地预测接下来将访问哪些位置。废话不多说……下面是一些代码。我对数据执行的操作是:取 __m128i 的低 64 位,并将其复制到同一数据的高 64 位。首先是最基本的循环,不加任何修饰:

// SOME_CONST is either 3 or 4, but this "operation" only needs to happen for 3

for ( i=0; i<arraySize; ++i )
{
  // Immediate operand for the shuffle.  It selects:
  //   result bits 0-31   = source bits 0-31
  //   result bits 32-63  = source bits 32-63
  //   result bits 64-95  = source bits 0-31
  //   result bits 96-127 = source bits 32-63
  // i.e. the low 64 bits are cloned into the high 64 bits.
  enum { CLONE_LOW64 = 0 | (1<<2) | (0<<4) | (1<<6) };

  // First vector of the entry that A[i] refers to.
  __m128i *const entry = D + (A[i] * SOME_CONST);
  int vec;

  // Exactly three vectors of the entry are rewritten in place.
  for ( vec = 0; vec < 3; ++vec )
  {
    entry[vec] = _mm_shuffle_epi32( entry[vec], CLONE_LOW64 );
  }

  // Suggestions for a cleverer SSE2 way to do this are welcome.  ;-)
}
//SOME_CONST 为 3 或 4,但此“操作”只需对前 3 个 __m128i 执行
for ( i=0; i…(此处的代码和部分正文在抓取时被截断。)如果我尝试预取任何预先计算出的地址,程序反而明显变慢(>10x)。我对此实在摸不着头脑。我的独立测试代码是:

#include <xmmintrin.h>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>

/* Number of entries in the work queue (indices into the data array). */
#define QUEUE_ELEMENTS    1048576
/* Size of one data element in BYTES (four __m128i vectors).  Parenthesized so
 * the macro behaves as a single value inside larger expressions, e.g.
 * `x / DATA_ELEMENT_SIZE`. */
#define DATA_ELEMENT_SIZE (4 * sizeof( __m128i ))
/* Number of elements in the data array; one per possible queue value. */
#define DATA_ELEMENTS     QUEUE_ELEMENTS

/* Number of passes made over the whole work queue. */
#define QUEUE_ITERATIONS  100000

/* Unroll selection: define LOOP_UNROLL_4 for 4-way, LOOP_UNROLL_2 for 2-way,
 * neither for the plain loop. */
#define LOOP_UNROLL_4
#define LOOP_UNROLL_2

/* The 4-way variant is a superset of the 2-way one, so selecting it always
 * implies LOOP_UNROLL_2 as well. */
#if defined( LOOP_UNROLL_4 ) && !defined( LOOP_UNROLL_2 )
  #define LOOP_UNROLL_2
#endif

/* Queue entries consumed per inner-loop iteration. */
#ifdef LOOP_UNROLL_4
  #define UNROLL_CONST 4
#else
  #ifdef LOOP_UNROLL_2
    #define UNROLL_CONST 2
  #else
    #define UNROLL_CONST 1
  #endif
#endif

/*
 * Selector for _mm_shuffle_epi32: result dwords are source dwords 0,1,0,1,
 * i.e. the low 64 bits are copied into the high 64 bits.  Same value as
 * 0 | (1<<2) | (0<<4) | (1<<6).
 */
#define CLONE_LOW_QWORD _MM_SHUFFLE( 1, 0, 1, 0 )

/*
 * Prefetch hook for the element that will be processed one batch later.
 * Compiles to nothing unless the build defines ENABLE_PREFETCH, so the
 * with/without comparison needs no source edits.
 * NOTE(review): the data is only 16-byte aligned, so on CPUs with 64-byte
 * cache lines an element may span two lines; this touches only the first.
 */
#ifdef ENABLE_PREFETCH
  #define PREFETCH_ELEMENT( p ) _mm_prefetch( (const char *) (p), _MM_HINT_T0 )
#else
  #define PREFETCH_ELEMENT( p ) ((void) 0)
#endif

/*
 * Clone the low 64 bits into the high 64 bits of the first three vectors of
 * one data element, storing the result back in place.  The store matters:
 * without it the shuffles are dead code and the compiler may drop the loads.
 */
static void cloneLowQwords( __m128i *element )
{
  element[0] = _mm_shuffle_epi32( element[0], CLONE_LOW_QWORD );
  element[1] = _mm_shuffle_epi32( element[1], CLONE_LOW_QWORD );
  element[2] = _mm_shuffle_epi32( element[2], CLONE_LOW_QWORD );
  // Per original code, no need to perform operation on element[3]
}

/*
 * Benchmark driver: fills a work queue with random element indices, then
 * repeatedly walks the queue applying cloneLowQwords() to the indexed
 * element, and reports the elapsed CPU time in whole seconds.
 *
 * Returns 0 on success, EXIT_FAILURE if memory cannot be allocated.
 */
int main( void )
{
  // DATA_ELEMENT_SIZE is in bytes; pointer arithmetic on __m128i * needs a
  // count of vectors instead.
  const unsigned long vecsPerElement = (DATA_ELEMENT_SIZE) / sizeof( __m128i );
  // The inner loop reads up to two batches past the current index, so the
  // queue carries that many extra (valid) entries at the end.
  const unsigned long queueSlots = QUEUE_ELEMENTS + 2 * UNROLL_CONST;
  const unsigned long totalVecs = DATA_ELEMENTS * vecsPerElement;
  unsigned long long randTemp;
  unsigned long i, outerLoop;
  unsigned long *workQueue;
  __m128i *data, *dataOrig;
  clock_t timeStamp;

  workQueue = malloc( queueSlots * sizeof *workQueue );
  // 15 spare bytes are enough to round any address up to a 16-byte boundary.
  dataOrig = malloc( (DATA_ELEMENTS * (DATA_ELEMENT_SIZE)) + 15 );
  if ( workQueue == NULL || dataOrig == NULL )
  {
    fprintf( stderr, "Allocation failed.\n" );
    free( dataOrig );
    free( workQueue );
    return EXIT_FAILURE;
  }

  // force 16-byte (128-bit) alignment
  data = (__m128i *) (((unsigned long long) dataOrig + 0xf) & ~0xfULL);

  // The values themselves are unimportant, but writing them once gives every
  // vector a defined value and touches all pages before the timed section.
  for ( i=0; i<totalVecs; ++i )
  {
    data[i] = _mm_setzero_si128();
  }

  for ( i=0; i<queueSlots; ++i )
  {
    // Dividing by RAND_MAX + 1 keeps the result strictly below DATA_ELEMENTS
    // even when rand() returns RAND_MAX.
    randTemp = (unsigned long long) rand() * (unsigned long long) DATA_ELEMENTS
               / ((unsigned long long) RAND_MAX + 1);
    workQueue[i] = (unsigned long) randTemp;
  }

  printf( "Starting work...\n" );
  // Actual work happening below... start counting.
  timeStamp = clock();

  for ( outerLoop = 0; outerLoop < QUEUE_ITERATIONS; ++outerLoop )
  {
    // curN:  elements processed in this inner iteration.
    // nextN: elements for the following iteration, computed one batch early
    //        so that a prefetch issued on them has time to complete.
    __m128i *cur0;
    #if UNROLL_CONST >= 2
      __m128i *cur1, *next0, *next1;
    #endif
    #if UNROLL_CONST >= 4
      __m128i *cur2, *cur3, *next2, *next3;
    #endif

    #if UNROLL_CONST >= 2
      next0 = data + (workQueue[0] * vecsPerElement);
      next1 = data + (workQueue[1] * vecsPerElement);
    #endif
    #if UNROLL_CONST >= 4
      next2 = data + (workQueue[2] * vecsPerElement);
      next3 = data + (workQueue[3] * vecsPerElement);
    #endif

    for ( i=0; i<QUEUE_ELEMENTS; i+=UNROLL_CONST )
    {
      #if UNROLL_CONST >= 2
        // Look exactly one batch ahead (i + UNROLL_CONST + lane) so no queue
        // entry is skipped whichever unroll factor is selected.
        cur0 = next0;
        next0 = data + (workQueue[i + UNROLL_CONST] * vecsPerElement);
        PREFETCH_ELEMENT( next0 );
        cur1 = next1;
        next1 = data + (workQueue[i + UNROLL_CONST + 1] * vecsPerElement);
        PREFETCH_ELEMENT( next1 );
      #else
        cur0 = data + (workQueue[i] * vecsPerElement);
      #endif
      #if UNROLL_CONST >= 4
        cur2 = next2;
        next2 = data + (workQueue[i + UNROLL_CONST + 2] * vecsPerElement);
        PREFETCH_ELEMENT( next2 );
        cur3 = next3;
        next3 = data + (workQueue[i + UNROLL_CONST + 3] * vecsPerElement);
        PREFETCH_ELEMENT( next3 );
      #endif

      cloneLowQwords( cur0 );
      #if UNROLL_CONST >= 2
        cloneLowQwords( cur1 );
      #endif
      #if UNROLL_CONST >= 4
        cloneLowQwords( cur2 );
        cloneLowQwords( cur3 );
      #endif
    }
    // Progress indicator: one dot per 1000 passes over the queue.
    if ( (outerLoop % 1000) == 0 ) { putchar( '.' ); fflush( stdout ); }
  }

  timeStamp = clock() - timeStamp;
  // clock_t is not guaranteed to be unsigned long, so convert for %lu.
  printf( "\nRun was %lu seconds.\n", (unsigned long) (timeStamp / CLOCKS_PER_SEC) );

  free( dataOrig );
  free( workQueue );

  return 0;
}
#包括
#包括
#包括
#包括
#定义队列元素1048576
#定义数据元素大小4*sizeof(\uuuum128i)
#定义数据元素队列元素
#定义队列\u迭代次数
#定义循环\u展开\u 4
#定义循环\u展开\u 2
#ifdef循环\u展开\u 4
#定义展开常数4
#否则
#ifdef循环\u展开\u 2
#定义展开常数2
#否则
#定义展开常数1
#恩迪夫
#恩迪夫
内部主(空)
{
无符号长随机数;
无符号长i,外环;
无符号长*工作队列;
__m128i*数据,*数据源;
时钟时间戳;
workQueue=malloc(QUEUE_ELEMENTS*sizeof(unsigned long));
dataOrig=malloc((数据元素*数据元素大小)+2);
if((无符号长)数据源(&0xf)
{
数据=((无符号长-长)数据源&~0xf)+0x10);
//强制16字节(128位)对齐
}else数据=数据源;
//不初始化数据,因为其内容不重要。

for ( i=0; i…(此处的代码在抓取时被截断。)

如果数据驻留在内存中,就不要指望有太大的加速;从内存预取所能带来的改进空间很小。


假设内存访问周期时间为 150 ns、缓存行为 64 字节、流式传输速率为 4 GB/s(这是我的 AMD 系统;英特尔更快),并且每次读取的 64 字节缓存行中只用到 48 字节(3 x 128 位),那么系统每秒取得 320 MB 的可用数据。预取可以使速率接近 4000 MB/s 的峰值。每读取 320 MB,预取最多可节省 0.92 秒。按 320 MB/s 计算,270 秒(4 分 30 秒)相当于 840 GB 的内存传输时间;程序实际花在这上面的时间可能只是其中的一小部分(……原文此处在抓取时被截断)

一般来说,除了一些非常特殊的情况之外,显式预取在现代 CPU 上对你不会有帮助;即便在那些情况下,要把预取做对也非常困难。如果做错了,它甚至可能弊大于利。另外请注意,关于这方面已经有许多非常相似(可能重复)的问题和答案,你也许想去看一看。参见,例如……

明白,Paul。我在这里搜索过这些问题,大部分答案都是“不要这样做,因为硬件会做得更好”(在那些情况下,这样说是正确的)。但我想弄明白,为什么这个特定案例不能从预取中受益——它需要在更大的数组中进行相当“看似随机”(但人完全可以预测)的访问。我确信硬件自己无法识别这种访问模式。

数据转置是我从手动预取中获得明显加速的少数场合之一:

你断言硬件无法理解这种访问模式,但你手上有强有力的证据表明它可以。你真的还需要别人来确认你的假设是错误的吗?
arraySize 有多大?我猜想,要让预取对你的代码产生明显影响,你需要使用非常大的数据集(几 GB);否则预取的效果很可能会被缓存掩盖。
// How many queue entries ahead of the current one to prefetch.
#define PREFETCH_DISTANCE 10
// trying 5 overnight, will see results tomorrow...

for ( i=0; i<arraySize; ++i )
{
  __m128i *dPtr;

  // Only look ahead while the future index is still inside A; the last
  // PREFETCH_DISTANCE iterations have nothing left to prefetch.
  // (Assumes A holds exactly arraySize entries -- confirm against the caller.)
  if ( arraySize - i > PREFETCH_DISTANCE )
  {
    const __m128i *dPtrFuture = D + (A[i + PREFETCH_DISTANCE] * SOME_CONST);

    // Touch the first and the last vector that will be modified.  The three
    // vectors cover 48 bytes, so with 64-byte cache lines they span at most
    // two lines, and these two addresses hit both of them.
    _mm_prefetch( (const char *) dPtrFuture, _MM_HINT_NTA );        // tried _MM_HINT_T0 too
    _mm_prefetch( (const char *) (dPtrFuture + 2), _MM_HINT_NTA );  // tried _MM_HINT_T0 too
  }

  // Clone the low 64 bits of each of the three vectors into its high 64 bits.
  dPtr = D + (A[i] * SOME_CONST);
  dPtr[0] = _mm_shuffle_epi32( dPtr[0], 0 | (1<<2) | (0<<4) | (1<<6) );
  dPtr[1] = _mm_shuffle_epi32( dPtr[1], 0 | (1<<2) | (0<<4) | (1<<6) );
  dPtr[2] = _mm_shuffle_epi32( dPtr[2], 0 | (1<<2) | (0<<4) | (1<<6) );
}
#include <xmmintrin.h>
#include <stdlib.h>
#include <time.h>
#include <stdio.h>

/* Number of entries in the work queue (indices into the data array). */
#define QUEUE_ELEMENTS    1048576
/* Size of one data element in BYTES (four __m128i vectors).  Parenthesized so
 * the macro behaves as a single value inside larger expressions, e.g.
 * `x / DATA_ELEMENT_SIZE`. */
#define DATA_ELEMENT_SIZE (4 * sizeof( __m128i ))
/* Number of elements in the data array; one per possible queue value. */
#define DATA_ELEMENTS     QUEUE_ELEMENTS

/* Number of passes made over the whole work queue. */
#define QUEUE_ITERATIONS  100000

/* Unroll selection: define LOOP_UNROLL_4 for 4-way, LOOP_UNROLL_2 for 2-way,
 * neither for the plain loop. */
#define LOOP_UNROLL_4
#define LOOP_UNROLL_2

/* The 4-way variant is a superset of the 2-way one, so selecting it always
 * implies LOOP_UNROLL_2 as well. */
#if defined( LOOP_UNROLL_4 ) && !defined( LOOP_UNROLL_2 )
  #define LOOP_UNROLL_2
#endif

/* Queue entries consumed per inner-loop iteration. */
#ifdef LOOP_UNROLL_4
  #define UNROLL_CONST 4
#else
  #ifdef LOOP_UNROLL_2
    #define UNROLL_CONST 2
  #else
    #define UNROLL_CONST 1
  #endif
#endif

/*
 * Selector for _mm_shuffle_epi32: result dwords are source dwords 0,1,0,1,
 * i.e. the low 64 bits are copied into the high 64 bits.  Same value as
 * 0 | (1<<2) | (0<<4) | (1<<6).
 */
#define CLONE_LOW_QWORD _MM_SHUFFLE( 1, 0, 1, 0 )

/*
 * Prefetch hook for the element that will be processed one batch later.
 * Compiles to nothing unless the build defines ENABLE_PREFETCH, so the
 * with/without comparison needs no source edits.
 * NOTE(review): the data is only 16-byte aligned, so on CPUs with 64-byte
 * cache lines an element may span two lines; this touches only the first.
 */
#ifdef ENABLE_PREFETCH
  #define PREFETCH_ELEMENT( p ) _mm_prefetch( (const char *) (p), _MM_HINT_T0 )
#else
  #define PREFETCH_ELEMENT( p ) ((void) 0)
#endif

/*
 * Clone the low 64 bits into the high 64 bits of the first three vectors of
 * one data element, storing the result back in place.  The store matters:
 * without it the shuffles are dead code and the compiler may drop the loads.
 */
static void cloneLowQwords( __m128i *element )
{
  element[0] = _mm_shuffle_epi32( element[0], CLONE_LOW_QWORD );
  element[1] = _mm_shuffle_epi32( element[1], CLONE_LOW_QWORD );
  element[2] = _mm_shuffle_epi32( element[2], CLONE_LOW_QWORD );
  // Per original code, no need to perform operation on element[3]
}

/*
 * Benchmark driver: fills a work queue with random element indices, then
 * repeatedly walks the queue applying cloneLowQwords() to the indexed
 * element, and reports the elapsed CPU time in whole seconds.
 *
 * Returns 0 on success, EXIT_FAILURE if memory cannot be allocated.
 */
int main( void )
{
  // DATA_ELEMENT_SIZE is in bytes; pointer arithmetic on __m128i * needs a
  // count of vectors instead.
  const unsigned long vecsPerElement = (DATA_ELEMENT_SIZE) / sizeof( __m128i );
  // The inner loop reads up to two batches past the current index, so the
  // queue carries that many extra (valid) entries at the end.
  const unsigned long queueSlots = QUEUE_ELEMENTS + 2 * UNROLL_CONST;
  const unsigned long totalVecs = DATA_ELEMENTS * vecsPerElement;
  unsigned long long randTemp;
  unsigned long i, outerLoop;
  unsigned long *workQueue;
  __m128i *data, *dataOrig;
  clock_t timeStamp;

  workQueue = malloc( queueSlots * sizeof *workQueue );
  // 15 spare bytes are enough to round any address up to a 16-byte boundary.
  dataOrig = malloc( (DATA_ELEMENTS * (DATA_ELEMENT_SIZE)) + 15 );
  if ( workQueue == NULL || dataOrig == NULL )
  {
    fprintf( stderr, "Allocation failed.\n" );
    free( dataOrig );
    free( workQueue );
    return EXIT_FAILURE;
  }

  // force 16-byte (128-bit) alignment
  data = (__m128i *) (((unsigned long long) dataOrig + 0xf) & ~0xfULL);

  // The values themselves are unimportant, but writing them once gives every
  // vector a defined value and touches all pages before the timed section.
  for ( i=0; i<totalVecs; ++i )
  {
    data[i] = _mm_setzero_si128();
  }

  for ( i=0; i<queueSlots; ++i )
  {
    // Dividing by RAND_MAX + 1 keeps the result strictly below DATA_ELEMENTS
    // even when rand() returns RAND_MAX.
    randTemp = (unsigned long long) rand() * (unsigned long long) DATA_ELEMENTS
               / ((unsigned long long) RAND_MAX + 1);
    workQueue[i] = (unsigned long) randTemp;
  }

  printf( "Starting work...\n" );
  // Actual work happening below... start counting.
  timeStamp = clock();

  for ( outerLoop = 0; outerLoop < QUEUE_ITERATIONS; ++outerLoop )
  {
    // curN:  elements processed in this inner iteration.
    // nextN: elements for the following iteration, computed one batch early
    //        so that a prefetch issued on them has time to complete.
    __m128i *cur0;
    #if UNROLL_CONST >= 2
      __m128i *cur1, *next0, *next1;
    #endif
    #if UNROLL_CONST >= 4
      __m128i *cur2, *cur3, *next2, *next3;
    #endif

    #if UNROLL_CONST >= 2
      next0 = data + (workQueue[0] * vecsPerElement);
      next1 = data + (workQueue[1] * vecsPerElement);
    #endif
    #if UNROLL_CONST >= 4
      next2 = data + (workQueue[2] * vecsPerElement);
      next3 = data + (workQueue[3] * vecsPerElement);
    #endif

    for ( i=0; i<QUEUE_ELEMENTS; i+=UNROLL_CONST )
    {
      #if UNROLL_CONST >= 2
        // Look exactly one batch ahead (i + UNROLL_CONST + lane) so no queue
        // entry is skipped whichever unroll factor is selected.
        cur0 = next0;
        next0 = data + (workQueue[i + UNROLL_CONST] * vecsPerElement);
        PREFETCH_ELEMENT( next0 );
        cur1 = next1;
        next1 = data + (workQueue[i + UNROLL_CONST + 1] * vecsPerElement);
        PREFETCH_ELEMENT( next1 );
      #else
        cur0 = data + (workQueue[i] * vecsPerElement);
      #endif
      #if UNROLL_CONST >= 4
        cur2 = next2;
        next2 = data + (workQueue[i + UNROLL_CONST + 2] * vecsPerElement);
        PREFETCH_ELEMENT( next2 );
        cur3 = next3;
        next3 = data + (workQueue[i + UNROLL_CONST + 3] * vecsPerElement);
        PREFETCH_ELEMENT( next3 );
      #endif

      cloneLowQwords( cur0 );
      #if UNROLL_CONST >= 2
        cloneLowQwords( cur1 );
      #endif
      #if UNROLL_CONST >= 4
        cloneLowQwords( cur2 );
        cloneLowQwords( cur3 );
      #endif
    }
    // Progress indicator: one dot per 1000 passes over the queue.
    if ( (outerLoop % 1000) == 0 ) { putchar( '.' ); fflush( stdout ); }
  }

  timeStamp = clock() - timeStamp;
  // clock_t is not guaranteed to be unsigned long, so convert for %lu.
  printf( "\nRun was %lu seconds.\n", (unsigned long) (timeStamp / CLOCKS_PER_SEC) );

  free( dataOrig );
  free( workQueue );

  return 0;
}