Cuda 这是注册延迟吗？_Cuda - Fatal编程技术网

Cuda 这是注册延迟吗？

cuda

Cuda 这是注册延迟吗？,cuda,Cuda,我正在GTX680上做一些性能CUDA测试，想知道是否有人能帮助我理解为什么我会得到以下性能结果。我正在运行的代码如下所示： #include <stdio.h> using namespace std; __global__ void test_hardcoded(int rec,int * output) { int a; int rec2=rec/2; if(threadIdx.x==1000) *output=rec; if(thread

我正在GTX680上做一些性能CUDA测试，想知道是否有人能帮助我理解为什么我会得到以下性能结果。我正在运行的代码如下所示：

#include <stdio.h>
using namespace std;


// Micro-benchmark kernel: runs a loop with a compile-time trip count (10000),
// executing one block-wide barrier and one integer add per iteration.
//
// rec    - value stored to output[0] by the guarded write below; it does not
//          influence the loop bound in this kernel.
// output - destination for the guarded writes (needs room for two ints if
//          the guard can ever be true).
//
// The writes are guarded by threadIdx.x==1000, which is false for every
// thread whenever blockDim.x <= 1000. They exist only so the compiler cannot
// discard `rec`, `rec2` and the loop result as dead code.
__global__ void test_hardcoded(int rec,int * output)
{
    // Must start from a defined value: the original read `a` uninitialized,
    // which is undefined behaviour and lets the compiler do anything with
    // the loop being measured.
    int a = 0;
    int rec2 = rec / 2;

    if (threadIdx.x == 1000)
    {
        output[0] = rec;
        output[1] = rec2;
    }

    // Every thread of the block executes the same number of iterations, so
    // the barrier inside the loop is reached uniformly.
    for (int i = 0; i < 10000; i++)
    {
        __syncthreads();
        a += i;
    }

    // Keeps the accumulated value observable so the loop is not eliminated.
    if (threadIdx.x == 1000) output[0] = a;
}
// Micro-benchmark kernel: same work per iteration as test_hardcoded (one
// block-wide barrier and one integer add), but the trip count is derived at
// run time from the `rec` argument.
//
// rec    - the loop runs for i = 0 .. rec/2 inclusive, i.e. rec/2 + 1
//          iterations (note: inclusive upper bound).
// output - destination for the guarded writes (needs room for two ints if
//          the guard can ever be true).
//
// The writes are guarded by threadIdx.x==1000, which is false for every
// thread whenever blockDim.x <= 1000. They exist only so the compiler cannot
// discard `rec`, `rec2` and the loop result as dead code.
__global__ void test_softcoded(int rec,int * output)
{
    // Must start from a defined value: the original read `a` uninitialized,
    // which is undefined behaviour.
    int a = 0;

    // Author's stated intent: derive the bound from the argument so the loop
    // limit is a computed value rather than the kernel parameter itself.
    int rec2 = rec / 2;

    if (threadIdx.x == 1000)
    {
        output[0] = rec;
        output[1] = rec2;
    }

    // The bound depends only on the kernel argument, so all threads of the
    // block iterate the same number of times and reach the barrier together.
    for (int i = 0; i <= rec2; i++)
    {
        __syncthreads();
        a += i;
    }

    // Keeps the accumulated value observable so the loop is not eliminated.
    if (threadIdx.x == 1000) output[0] = a;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Times one launch of test_hardcoded and one launch of test_softcoded with
// CUDA events and prints the elapsed time of each in milliseconds.
// Returns 0 on success and 1 if any CUDA runtime call or launch fails.
int main(int argc, char *argv[])
{
    (void)argc;
    (void)argv;

    float timestamp = 0.0f;
    cudaEvent_t event_start, event_stop;

    // Initialise
    if (!cudaCallOk(cudaSetDevice(0), "cudaSetDevice")) return 1;
    if (!cudaCallOk(cudaEventCreate(&event_start), "cudaEventCreate(start)")) return 1;
    if (!cudaCallOk(cudaEventCreate(&event_stop), "cudaEventCreate(stop)")) return 1;

    // 32x32 threads per block, 1000 blocks along y.
    dim3 threadsPerBlock(32, 32, 1);
    dim3 blocks(1, 1000, 1);

    // The kernels only dereference `output` when threadIdx.x==1000, which
    // cannot happen with blockDim.x == 32, so NULL is never dereferenced.

    // --- test_hardcoded ---
    if (!cudaCallOk(cudaEventRecord(event_start, 0), "cudaEventRecord(start)")) return 1;
    test_hardcoded<<<blocks,threadsPerBlock,0>>>(10000,NULL);
    // A kernel launch returns no status; launch-configuration errors are
    // only visible through cudaGetLastError().
    if (!cudaCallOk(cudaGetLastError(), "test_hardcoded launch")) return 1;
    if (!cudaCallOk(cudaEventRecord(event_stop, 0), "cudaEventRecord(stop)")) return 1;
    // Blocks until the kernel has finished; also surfaces execution errors.
    if (!cudaCallOk(cudaEventSynchronize(event_stop), "cudaEventSynchronize")) return 1;
    if (!cudaCallOk(cudaEventElapsedTime(&timestamp, event_start, event_stop),
                    "cudaEventElapsedTime")) return 1;
    printf("test_hardcoded() took  %fms \n", timestamp);

    // --- test_softcoded ---
    if (!cudaCallOk(cudaEventRecord(event_start, 0), "cudaEventRecord(start)")) return 1;
    test_softcoded<<<blocks,threadsPerBlock,0>>>(20000,NULL);
    if (!cudaCallOk(cudaGetLastError(), "test_softcoded launch")) return 1;
    if (!cudaCallOk(cudaEventRecord(event_stop, 0), "cudaEventRecord(stop)")) return 1;
    if (!cudaCallOk(cudaEventSynchronize(event_stop), "cudaEventSynchronize")) return 1;
    if (!cudaCallOk(cudaEventElapsedTime(&timestamp, event_start, event_stop),
                    "cudaEventElapsedTime")) return 1;
    printf("test_softcoded() took  %fms \n", timestamp);

    // Release the timing events.
    if (!cudaCallOk(cudaEventDestroy(event_start), "cudaEventDestroy(start)")) return 1;
    if (!cudaCallOk(cudaEventDestroy(event_stop), "cudaEventDestroy(stop)")) return 1;

    return 0;
}

test_hardcoded（）函数比test-softcoded（）快两倍

我知道在test_softcoded（）中存在潜在的写后读注册表依赖关系，但我的意识是注册表延迟完全隐藏，占用率高，应该非常高），因此我想知道可能存在什么问题，以及如何提高test_softcoded（）的性能.

由于这个硬编码的值，编译器可以进行一些优化，比如循环展开，这可能会在一定程度上提高性能。这可能就是原因

您可以通过在“test_softcoded”中为for循环添加一些展开来检查它，如

在（int i=0；i）之前添加类似“#pragma unroll 5000”的代码，您可以发布编译器为这两种情况生成的PTX吗？这可能是用户1695033指出的循环展开的结果，也可能是使用立即操作数的结果。

$ nvcc -arch=sm_30 test7.cu
$ ./a.out

test_hardcoded() took  51.353985ms 
test_softcoded() took  99.209694ms