Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/file/3.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Assembly 流固有特性会降低性能_Assembly_Vectorization_Sse_Intrinsics_Avx - Fatal编程技术网

Assembly 流式存储内建函数（streaming intrinsics）导致性能下降

Assembly 流固有特性会降低性能,assembly,vectorization,sse,intrinsics,avx,Assembly,Vectorization,Sse,Intrinsics,Avx,我在玩弄_mm_stream_ps的内在特性,在理解它的性能方面遇到了一些问题 下面是我正在处理的代码片段。。。 流版本: #include <stdio.h> #include <stdint.h> #include <stdlib.h> #include <omp.h> #include <immintrin.h> #define NUM_ELEMENTS 10000000L static void copy_temporal

我在尝试使用 _mm_stream_ps 这个内建函数（intrinsic），但在理解它的性能表现时遇到了一些问题。

下面是我正在处理的代码片段。。。 流版本:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <omp.h>

#include <immintrin.h>

#define NUM_ELEMENTS 10000000L

/*
 * Copy 4 * NUM_ELEMENTS floats from x to y with regular (cached) 128-bit
 * stores.  Both pointers are accessed with the aligned load/store
 * intrinsics, so each must be 16-byte aligned.
 */
static void copy_temporal(float* restrict x, float* restrict y)
{
   const uint64_t passes = NUM_ELEMENTS/2;

   /* every pass moves two 4-float vectors, i.e. 8 consecutive floats */
   for(uint64_t pass = 0; pass < passes; ++pass){
      const uint64_t base = 8 * pass;

      __m128 first = _mm_load_ps(x + base);
      _mm_store_ps(y + base, first);

      __m128 second = _mm_load_ps(x + base + 4);
      _mm_store_ps(y + base + 4, second);
   }
}
/*
 * Copy 4 * NUM_ELEMENTS floats from x to y with non-temporal (streaming)
 * 128-bit stores.  Both pointers are accessed with aligned intrinsics, so
 * each must be 16-byte aligned.
 */
static void copy_nontemporal(float* restrict x, float* restrict y)
{
   /* every iteration moves two 4-float vectors, i.e. 8 consecutive floats */
   for(uint64_t i = 0; i < NUM_ELEMENTS/2; ++i){
      _mm_stream_ps(y,_mm_load_ps(x));
      _mm_stream_ps(y+4,_mm_load_ps(x+4));
      x+=8;
      y+=8;
   }

   /*
    * Streaming stores are weakly ordered and may still be pending in
    * write-combining buffers when the loop ends.  Fence here so the copy
    * is complete before the caller reads its timer.
    */
   _mm_sfence();
}

/*
 * Benchmark driver: allocates two 32-byte-aligned buffers of
 * 4 * NUM_ELEMENTS floats, initializes them, then times one pass of
 * copy_nontemporal followed by one pass of copy_temporal and prints the
 * bandwidth computed for each.
 *
 * Returns 0 on success, 1 if either buffer cannot be allocated.
 */
int main(int argc, char** argv)
{
   (void)argc;
   (void)argv;

   uint64_t sizeX = sizeof(float) * 4 * NUM_ELEMENTS;
   float *x = (float*) _mm_malloc(sizeX,32);
   float *y = (float*) _mm_malloc(sizeX,32);

   /* the initialization loop below dereferences both buffers */
   if (x == NULL || y == NULL) {
      fprintf(stderr, "failed to allocate 2 x %g MB\n", sizeX/1024.0/1024.0);
      if (x != NULL) _mm_free(x);
      if (y != NULL) _mm_free(y);
      return 1;
   }

   //initialization (also touches every page of both buffers before timing)
   for(uint64_t i = 0 ; i < 4 * NUM_ELEMENTS; ++i){
      x[i] = (float)rand()/RAND_MAX;
      y[i] = 0;
   }

   printf("%g MB allocated\n",(2 * sizeX)/1024.0/1024.0); 

   /*
    * The factor 3 in both bandwidth figures counts two reads plus one
    * write per element; the same factor is used for both runs so that
    * the two numbers can be compared directly.
    */
   double start = omp_get_wtime();
   copy_nontemporal(x, y);
   double time = omp_get_wtime() - start;
   printf("Bandwidth (non-temporal): %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/time);

   start = omp_get_wtime();
   copy_temporal(x, y);
   time = omp_get_wtime() - start;
   printf("Bandwidth: %g GB/s\n",((3 * sizeX)/1024.0/1024.0/1024.0)/time);

   _mm_free(x);
   _mm_free(y);

   return 0;
}
真正让我困惑的是,如果我使用不对齐的加载和存储(即storeu_ps/loadu_ps),我会在Xeon CPU(而不是笔记本电脑)上看到更好的性能:

我原本预期流（streaming）版本会比非流版本更快——因为它省去了对 y 的冗余加载。然而，测量结果表明，流版本的速度实际上只有非流版本的一半。

你对此有什么解释吗

使用的编译器：Intel 14.0.1；编译器标志：-O3 -restrict -xAVX；使用的CPU：Intel Xeon E5-2650


谢谢。

流变体创建直接到DRAM的流水线突发写入。速度应大致与DRAM的速度相匹配。标准存储写入缓存(但如果数据不在缓存中,则先将其读入缓存)。如果数据已经在缓存中,则标准存储以缓存写入的速度运行。通常,使用流方法时,大小远大于最后一级缓存大小的写入速度要快得多。使用标准存储时,小型写入通常更快。尝试使用几GB的缓冲区大小运行测试。stream方法应该更快

这里有一个基准来证明:

#define __USE_MINGW_ANSI_STDIO 1
#include <stdlib.h>
#include <intrin.h>
#include <windows.h>
#include <stdio.h>
#include <stdint.h>

//-----------------------------------------------------------------------------
//
//  queryPerformanceCounter - thin wrapper over the Win32
//                            QueryPerformanceCounter call that hands the
//                            counter value back as the return value
//                            instead of through an out-parameter.

uint64_t queryPerformanceCounter (void)
    {
    LARGE_INTEGER ticks;

    QueryPerformanceCounter (&ticks);
    return (uint64_t) ticks.QuadPart;
    }

//-----------------------------------------------------------------------------
//
//  queryPerformanceFrequency - thin wrapper over the Win32
//                              QueryPerformanceFrequency call that hands
//                              the frequency back directly as the return
//                              value.

uint64_t queryPerformanceFrequency (void)
    {
    LARGE_INTEGER ticksPerSecond;

    QueryPerformanceFrequency (&ticksPerSecond);
    return (uint64_t) ticksPerSecond.QuadPart;
    }

//---------------------------------------------------------------------------

// testNontemporal - copy numberOfVectors 4-float vectors from x to y using
//                   non-temporal (streaming) stores.
//
//   x               - source; read with _mm_load_ps, so 16-byte aligned
//   y               - destination; written with _mm_stream_ps, so 16-byte
//                     aligned
//   numberOfVectors - number of 128-bit vectors (4 floats each) to copy

static void testNontemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;

    // main loop handles two vectors (8 floats) per iteration
    for(i = 0; i < numberOfVectors / 2; ++i)
        {
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }

    // an odd count leaves one vector that the paired loop never reaches
    if (numberOfVectors & 1)
        _mm_stream_ps(y,_mm_load_ps(x));

    // streaming stores are weakly ordered; fence so that they have all
    // completed before the caller continues (e.g. reads its timer)
    _mm_sfence();
    }

//---------------------------------------------------------------------------

// testTemporal - copy numberOfVectors 4-float vectors from x to y using
//                regular (cached) stores.
//
//   x               - source; read with _mm_load_ps, so 16-byte aligned
//   y               - destination; written with _mm_store_ps, so 16-byte
//                     aligned
//   numberOfVectors - number of 128-bit vectors (4 floats each) to copy

static void testTemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;

    // main loop handles two vectors (8 floats) per iteration
    for(i = 0; i < numberOfVectors / 2; ++i)
        {
        _mm_store_ps(y,_mm_load_ps(x));
        _mm_store_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }

    // an odd count leaves one vector that the paired loop never reaches
    if (numberOfVectors & 1)
        _mm_store_ps(y,_mm_load_ps(x));
    }

//---------------------------------------------------------------------------

// runtests - allocate two 400 MiB float buffers, copy one into the other
//            once with the selected kernel, and print the copy rate.
//
//   nonTemporal - non-zero selects testNontemporal (streaming stores),
//                 zero selects testTemporal (regular stores)
//
// Terminates the process with exit status 1 if either allocation fails.

static void runtests (int nonTemporal)
    {
    const uint64_t numberOfBytes = 400 * 0x100000ull;
    const uint64_t numberOfFloats = numberOfBytes / sizeof (float);
    const uint64_t numberOfVectors = numberOfFloats / 4;
    float *x = _mm_malloc (numberOfBytes, 32);
    float *y = _mm_malloc (numberOfBytes, 32);
    uint64_t index, startCount, elapsed;
    double gigabytes, gbPerSecond;

    if (!x || !y)
        exit (1);

    // give the source buffer valid floating point contents (avoids a
    // performance penalty) and zero the destination
    for (index = 0; index < numberOfFloats; index++)
        {
        x [index] = (float) index;
        y [index] = 0;
        }

    // only the copy itself is timed
    startCount = queryPerformanceCounter ();
    if (!nonTemporal)
        testTemporal (x, y, numberOfVectors);
    else
        testNontemporal (x, y, numberOfVectors);
    elapsed = queryPerformanceCounter () - startCount;

    // bytes copied, expressed in units of 2^30, divided by elapsed seconds
    gigabytes = (double) numberOfBytes / 0x40000000;
    gbPerSecond = gigabytes * queryPerformanceFrequency () / elapsed;
    printf ("%.2f GB/s\n", gbPerSecond);

    _mm_free (x);
    _mm_free (y);
    }

//---------------------------------------------------------------------------

// main - run the copy benchmark once with regular stores and once with
//        non-temporal stores, printing a heading before each result.

int main (void)
    {
    static const char *const heading [2] =
        {
        "using temporal stores\n",
        "using non-temporal stores\n"
        };
    int nonTemporal;

    // raise our priority to increase measurement accuracy
    SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);

    // pass 0 = temporal stores, pass 1 = non-temporal stores
    for (nonTemporal = 0; nonTemporal <= 1; nonTemporal++)
        {
        fputs (heading [nonTemporal], stdout);
        runtests (nonTemporal);
        }
    return 0;
    }

//---------------------------------------------------------------------------

另外，非临时存储会把目标缓存行从所有缓存中逐出。如果您在该行本来会被自然逐出之前再次访问它，性能损失会非常大。

正如ScottD所指出的，问题的答案在于生成的汇编代码。显然，英特尔编译器足够智能，能够检测出访问模式，即使对临时（temporal）版本也会自动生成非临时存储指令（见下面清单中的 vmovntps）。

以下是编译器为临时版本生成的汇编代码的示例:

..___tag_value___Z13copy_temporalPfS_.35:                       #
        xor       edx, edx                                      #22.4
        xor       eax, eax                                      #
..B2.2:                         # Preds ..B2.2 ..B2.1
        vmovups   xmm0, XMMWORD PTR [rax+rdi]                   #23.34
        inc       rdx                                           #22.4
        vmovntps  XMMWORD PTR [rax+rsi], xmm0                   #23.20
        vmovups   xmm1, XMMWORD PTR [16+rax+rdi]                #24.36
        vmovntps  XMMWORD PTR [16+rax+rsi], xmm1                #24.20
        vmovups   xmm2, XMMWORD PTR [32+rax+rdi]                #23.34
        vmovntps  XMMWORD PTR [32+rax+rsi], xmm2                #23.20
        vmovups   xmm3, XMMWORD PTR [48+rax+rdi]                #24.36
        vmovntps  XMMWORD PTR [48+rax+rsi], xmm3                #24.20
        add       rax, 64                                       #22.4
        cmp       rdx, 5000000                                  #22.4
        jb        ..B2.2        # Prob 99%                      #22.4
仍然存在的问题如下:


为什么在 E5-2650 CPU（见上文）上，非对齐的临时（temporal）版本性能优于非临时（non-temporal）版本？我已经查看了生成的汇编代码，编译器确实生成了 vmovups 指令（因为不保证对齐）。

谢谢你的回复。我已经在使用大小为400MB的缓冲区（也就是说，比我系统中的任何缓存都大得多）。此外，为了读取一些硬件计数器，我对代码做了插桩，结果很明确（即使用stream_ps可以减少二级缓存写入未命中）。但是，我仍然无法解释这两个版本之间的巨大性能差异。我将添加一个示例基准测试，试图展示非临时（流式）存储在大型缓冲区上的优势。代码写得比较仓促粗糙（quick and dirty），但我认为是正确的。其中使用了不可移植的（Windows）计时函数。我已经更新了我的原始帖子，但我无法复现您的结果（即使我将您的代码移植到Linux，结果仍然相同）。你知道为什么会这样吗？此外，你对非对齐版本更快的原因有何解释？这实际上可能指向真正的问题，因为流式存储确实需要对齐。@ScottD，我在MSVC中运行了您的测试。多运行几次，两者的结果就会互有高低。这两种方法在误差范围内速度相同。@Z，您使用的是优化的发布（Release）版本还是调试（Debug）版本？我在运行调试构建时得到了您所描述的结果。但是发布版本（或命令行 cl -Ox stream.c）的表现与预期一致。我本想说这取决于所讨论的缓冲区的大小，但是您对ScottD的评论表明它们非常大。在这一点上，我不确定发生了什么。您可以尝试各种方法，如注释掉
#pragma
、在不使用
-xAVX
的情况下编译等,寻找常规存储和非临时存储之间性能比率的变化。无需展开循环。循环展开仅在依赖链中有用,并且没有依赖链。CPU可以帮你处理这个问题。不过我有个问题。带宽计算中的因子3是多少?两次读取+一次写入。尽管非时态版本只进行一次阅读,但为了简化比较,我保留了三个因子。如果ICC做了与你所说的不同的事情,那就令人失望了。我更希望它能实现你想要的内在目标。
#define __USE_MINGW_ANSI_STDIO 1
#include <stdlib.h>
#include <intrin.h>
#include <windows.h>
#include <stdio.h>
#include <stdint.h>

//-----------------------------------------------------------------------------
//
//  queryPerformanceCounter - thin wrapper over the Win32
//                            QueryPerformanceCounter call that hands the
//                            counter value back as the return value
//                            instead of through an out-parameter.

uint64_t queryPerformanceCounter (void)
    {
    LARGE_INTEGER ticks;

    QueryPerformanceCounter (&ticks);
    return (uint64_t) ticks.QuadPart;
    }

//-----------------------------------------------------------------------------
//
//  queryPerformanceFrequency - thin wrapper over the Win32
//                              QueryPerformanceFrequency call that hands
//                              the frequency back directly as the return
//                              value.

uint64_t queryPerformanceFrequency (void)
    {
    LARGE_INTEGER ticksPerSecond;

    QueryPerformanceFrequency (&ticksPerSecond);
    return (uint64_t) ticksPerSecond.QuadPart;
    }

//---------------------------------------------------------------------------

// testNontemporal - copy numberOfVectors 4-float vectors from x to y using
//                   non-temporal (streaming) stores.
//
//   x               - source; read with _mm_load_ps, so 16-byte aligned
//   y               - destination; written with _mm_stream_ps, so 16-byte
//                     aligned
//   numberOfVectors - number of 128-bit vectors (4 floats each) to copy

static void testNontemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;

    // main loop handles two vectors (8 floats) per iteration
    for(i = 0; i < numberOfVectors / 2; ++i)
        {
        _mm_stream_ps(y,_mm_load_ps(x));
        _mm_stream_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }

    // an odd count leaves one vector that the paired loop never reaches
    if (numberOfVectors & 1)
        _mm_stream_ps(y,_mm_load_ps(x));

    // streaming stores are weakly ordered; fence so that they have all
    // completed before the caller continues (e.g. reads its timer)
    _mm_sfence();
    }

//---------------------------------------------------------------------------

// testTemporal - copy numberOfVectors 4-float vectors from x to y using
//                regular (cached) stores.
//
//   x               - source; read with _mm_load_ps, so 16-byte aligned
//   y               - destination; written with _mm_store_ps, so 16-byte
//                     aligned
//   numberOfVectors - number of 128-bit vectors (4 floats each) to copy

static void testTemporal (float *x, float *y, uint64_t numberOfVectors)
    {
    uint64_t i;

    // main loop handles two vectors (8 floats) per iteration
    for(i = 0; i < numberOfVectors / 2; ++i)
        {
        _mm_store_ps(y,_mm_load_ps(x));
        _mm_store_ps(y+4,_mm_load_ps(x+4));
        y+=8; x+=8;
        }

    // an odd count leaves one vector that the paired loop never reaches
    if (numberOfVectors & 1)
        _mm_store_ps(y,_mm_load_ps(x));
    }

//---------------------------------------------------------------------------

// runtests - allocate two 400 MiB float buffers, copy one into the other
//            once with the selected kernel, and print the copy rate.
//
//   nonTemporal - non-zero selects testNontemporal (streaming stores),
//                 zero selects testTemporal (regular stores)
//
// Terminates the process with exit status 1 if either allocation fails.

static void runtests (int nonTemporal)
    {
    const uint64_t numberOfBytes = 400 * 0x100000ull;
    const uint64_t numberOfFloats = numberOfBytes / sizeof (float);
    const uint64_t numberOfVectors = numberOfFloats / 4;
    float *x = _mm_malloc (numberOfBytes, 32);
    float *y = _mm_malloc (numberOfBytes, 32);
    uint64_t index, startCount, elapsed;
    double gigabytes, gbPerSecond;

    if (!x || !y)
        exit (1);

    // give the source buffer valid floating point contents (avoids a
    // performance penalty) and zero the destination
    for (index = 0; index < numberOfFloats; index++)
        {
        x [index] = (float) index;
        y [index] = 0;
        }

    // only the copy itself is timed
    startCount = queryPerformanceCounter ();
    if (!nonTemporal)
        testTemporal (x, y, numberOfVectors);
    else
        testNontemporal (x, y, numberOfVectors);
    elapsed = queryPerformanceCounter () - startCount;

    // bytes copied, expressed in units of 2^30, divided by elapsed seconds
    gigabytes = (double) numberOfBytes / 0x40000000;
    gbPerSecond = gigabytes * queryPerformanceFrequency () / elapsed;
    printf ("%.2f GB/s\n", gbPerSecond);

    _mm_free (x);
    _mm_free (y);
    }

//---------------------------------------------------------------------------

// main - run the copy benchmark once with regular stores and once with
//        non-temporal stores, printing a heading before each result.

int main (void)
    {
    static const char *const heading [2] =
        {
        "using temporal stores\n",
        "using non-temporal stores\n"
        };
    int nonTemporal;

    // raise our priority to increase measurement accuracy
    SetPriorityClass (GetCurrentProcess (), REALTIME_PRIORITY_CLASS);

    // pass 0 = temporal stores, pass 1 = non-temporal stores
    for (nonTemporal = 0; nonTemporal <= 1; nonTemporal++)
        {
        fputs (heading [nonTemporal], stdout);
        runtests (nonTemporal);
        }
    return 0;
    }

//---------------------------------------------------------------------------
using temporal stores
5.57 GB/s
using non-temporal stores
8.35 GB/s
..___tag_value___Z13copy_temporalPfS_.35:                       #
        xor       edx, edx                                      #22.4
        xor       eax, eax                                      #
..B2.2:                         # Preds ..B2.2 ..B2.1
        vmovups   xmm0, XMMWORD PTR [rax+rdi]                   #23.34
        inc       rdx                                           #22.4
        vmovntps  XMMWORD PTR [rax+rsi], xmm0                   #23.20
        vmovups   xmm1, XMMWORD PTR [16+rax+rdi]                #24.36
        vmovntps  XMMWORD PTR [16+rax+rsi], xmm1                #24.20
        vmovups   xmm2, XMMWORD PTR [32+rax+rdi]                #23.34
        vmovntps  XMMWORD PTR [32+rax+rsi], xmm2                #23.20
        vmovups   xmm3, XMMWORD PTR [48+rax+rdi]                #24.36
        vmovntps  XMMWORD PTR [48+rax+rsi], xmm3                #24.20
        add       rax, 64                                       #22.4
        cmp       rdx, 5000000                                  #22.4
        jb        ..B2.2        # Prob 99%                      #22.4