Performance CUDA性能怀疑_Performance_Cuda_Bandwidth_Gpu

Performance CUDA性能怀疑

performance cuda

Performance CUDA性能怀疑,performance,cuda,bandwidth,gpu,Performance,Cuda,Bandwidth,Gpu,由于我没有收到CUDA论坛的回复，我将在这里尝试：在CUDA做了一些程序之后，我现在开始获得它们的有效带宽。然而，我有一些奇怪的结果，例如在下面的代码中，我可以对向量中的所有元素求和（不考虑维度），使用展开代码和“正常”代码的带宽似乎具有相同的中值结果（大约3000 Gb/s）我不知道我是否做错了什么（好吧，程序运行得很好），但从我目前所读到的来看，展开代码应该有更高的带宽 #include <stdio.h> #include <limits.h> #include

由于我没有收到CUDA论坛的回复，我将在这里尝试：

在CUDA做了一些程序之后，我现在开始获得它们的有效带宽。然而，我有一些奇怪的结果，例如在下面的代码中，我可以对向量中的所有元素求和（不考虑维度），使用展开代码和“正常”代码的带宽似乎具有相同的中值结果（大约3000 Gb/s）我不知道我是否做错了什么（好吧，程序运行得很好），但从我目前所读到的来看，展开代码应该有更高的带宽

#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <math.h>
#define elements 1000
#define blocksize 16    


__global__ void vecsumkernel(float*input, float*output,int nelements){



    __shared__ float psum[blocksize];
    int tid=threadIdx.x;

    if(tid + blockDim.x * blockIdx.x < nelements)
    psum[tid]=input[tid+blockDim.x*blockIdx.x];
    else
    psum[tid]=0.0f;
    __syncthreads();

    //WITHOUT UNROLL

    int stride;     
    for(stride=blockDim.x/2;stride>0;stride>>=1){
            if(tid<stride)
                    psum[tid]+=psum[tid+stride];
    __syncthreads();
    }
    if(tid==0)
            output[blockIdx.x]=psum[0];


    //WITH UNROLL
 /*
    if(blocksize>=512 && tid<256) psum[tid]+=psum[tid+256];__syncthreads();
    if(blocksize>=256 && tid<128) psum[tid]+=psum[tid+128];__syncthreads();
    if(blocksize>=128 && tid<64) psum[tid]+=psum[tid+64];__syncthreads();


    if (tid < 32) {
            if (blocksize >= 64) psum[tid] += psum[tid + 32];
            if (blocksize >= 32) psum[tid] += psum[tid + 16];
            if (blocksize >= 16) psum[tid] += psum[tid + 8];
            if (blocksize >=  8) psum[tid] += psum[tid + 4];
            if (blocksize >=  4) psum[tid] += psum[tid + 2];
            if (blocksize >=  2) psum[tid] += psum[tid + 1];
    }*/

    if(tid==0)
            output[blockIdx.x]=psum[0];



}

void vecsumv2(float*input, float*output, int nelements){
    dim3 dimBlock(blocksize,1,1);
    int i;

    for(i=((int)ceil((double)(nelements)/(double)blocksize))*blocksize;i>1;i(int)ceil((double)i/(double)blocksize)){
            dim3 dimGrid((int)ceil((double)i/(double)blocksize),1,1);
            printf("\ni=%d\ndimgrid=%u\n ",i,dimGrid.x);

            vecsumkernel<<<dimGrid,dimBlock>>>(i==((int)ceil((double)(nelements)/(double)blocksize))*blocksize ?input:output,output,i==((int)ceil((double)(nelements)/(double)blocksize))*blocksize ? elements:i);
    }

 }

 void printVec(float*vec,int dim){
    printf("\n{");
    for(int i=0;i<dim;i++)
            printf("%f ",vec[i]);
    printf("}\n");
 }

 int main(){
    cudaEvent_t evstart, evstop;
    cudaEventCreate(&evstart);
    cudaEventCreate(&evstop);


    float*input=(float*)malloc(sizeof(float)*(elements));
    for(int i=0;i<elements;i++)
            input[i]=(float) i;


    float*output=(float*)malloc(sizeof(float)*elements);



    float *input_d,*output_d;

    cudaMalloc((void**)&input_d,elements*sizeof(float));

    cudaMalloc((void**)&output_d,elements*sizeof(float));



    cudaMemcpy(input_d,input,elements*sizeof(float),cudaMemcpyHostToDevice);


    cudaEventRecord(evstart,0);

    vecsumv2(input_d,output_d,elements);

    cudaEventRecord(evstop,0);
    cudaEventSynchronize(evstop);
    float time;
    cudaEventElapsedTime(&time,evstart,evstop);
    printf("\ntempo gasto:%f\n",time);
    float Bandwidth=((1000*4*2)/10^9)/time;
    printf("\n Bandwidth:%f Gb/s\n",Bandwidth);


    cudaMemcpy(output,output_d,elements*sizeof(float),cudaMemcpyDeviceToHost);


    cudaFree(input_d);
    cudaFree(output_d);
    printf("soma do vector");
    printVec(output,4);



   }

#包括
#包括
#包括
#包括
#定义元素1000
#定义块大小16
__全局_uu; void vecsumkernel（浮点*输入、浮点*输出、整数元素）{
__共享浮点psum[块大小]；
int tid=threadIdx.x；
if（tid+BLOCKIM.x*blockIdx.x0；步幅>>=1）{
如果（tid=512&&tid=256&&tid=128&&tid=64）psum[tid]+=psum[tid+32]；
如果（块大小>=32）psum[tid]+=psum[tid+16]；
如果（块大小>=16）psum[tid]+=psum[tid+8]；
如果（块大小>=8）psum[tid]+=psum[tid+4]；
如果（块大小>=4）psum[tid]+=psum[tid+2]；
如果（块大小>=2）psum[tid]+=psum[tid+1]；
}*/
如果（tid==0）
输出[blockIdx.x]=psum[0]；
}
void vecsumv2（浮点*输入、浮点*输出、整数元素）{
dim3 dimBlock（块大小，1,1）；
int i；
对于（i=（（int）cell（（double）（neelements）/（double）blocksize））*blocksize；i>1；i（int）cell（（double）i/（double）blocksize））{
dim3 dimGrid（（int）ceil（（双）i/（双）块大小），1,1；
printf（“\ni=%d\ndimgrid=%u\n”，i，dimGrid.x）；
vecsumkernel（i=（（int）cell（（double）（neelements）/（double）blocksize））*blocksize？输入：输出，i=（（int）cell（（double）（neelements）/（double）blocksize））*blocksize？元素：i）；
}
}
无效打印向量（浮点*向量，整数尺寸）{
printf（“\n{”）；
对于（int i=0；i展开的代码中有很多分支。我计算了另外十个分支。通常在GPU上的一个扭曲内进行分支是昂贵的，因为扭曲中的所有线程最终都会等待分支（分歧）
有关扭曲散度的详细信息，请参见此处：

您是否尝试使用探查器查看发生了什么情况？
3000 Gb/s没有意义。PCIe在每个方向上的最大总线速度为8Gb/s
看看这篇文章，了解如何加快实现。
也考虑到这个库已经在模块
中实现了，你的未展开代码无效。对于<代码> Strud我看到你在内核中做减法和。这是一个很好的NVIDIA优化GPU的方法。你会发现，相同的代码，它的吞吐量超过2 Gb/s，被优化到63 Gb/s 在本指南中。
我从Nvidia Reduce.pdf中获取了展开代码。我不知道探查器的用途是什么（我一个月前才开始与cuda合作，计算机工程/编程不是我的领域）。此外，如何计算10个额外分支？我正在计算“如果”s、 你不是每次都做10次，但是有很多可能的代码路径，每个分支都可能导致扭曲发散。我认为这个值非常高，但我发现带宽是用我在CUDA编程指南上看到的公式计算的：float带宽=（（1000*4*2）/10^9）/time；//BW=（Bytesread+bytesrite）/10 ^ 9／时间与展开代码，IM检查我的代码与英伟达还原.PDF，我认为代码是相同的= /我相信你说的是真的看了看代码我不明白为什么有些线程不进入for循环。我看到只有一些线程（tid我明白你说的，如果步幅是ex 16，tid从0到16进入for，而其他人没有（我想这就是你的意思）。我认为分支冬青在if条件下发生，但我不知道这是否有问题。我知道的是，它应该效率较低