Cuda global memory writes are extremely slow

I am currently writing code that computes an integral histogram on the GPU using the NVIDIA Thrust library.

I allocate a contiguous block of device memory, which I keep updating with a custom functor. The problem is that writes to device memory are very slow, while reads are actually fine.

The basic setup is the following:
struct HistogramCreation
{
  HistogramCreation(
    ...
    // pointer to memory
    ...
  ){}

  /// The actual summation operator
  __device__ void operator()(int index){
    .. do the calculations ..
    for(int j = 0; j < 30; j++){

      (1)  *_memoryPointer = values (also using reads to such locations);

    }
  }
};
void foo(){

  cudaMalloc(_pointer, size);

  HistogramCreation initialCreation( ... _pointer ...);
  thrust::for_each(
    thrust::make_counting_iterator(0),
    thrust::make_counting_iterator(_imageSize),
    initialCreation);
}
If I change the write in (1) to a write into a local variable instead, the performance is much better. This is the only global memory write I have.

Using the memory write I get about 2 s for HD footage; using the local variable it takes about 50 ms, so about a factor of 40 less.
Why is this so slow? How can I improve it?

When writing GPU code you should avoid reading from and writing to global memory wherever you can. Global memory on the GPU is very slow; that is a hardware characteristic. The only thing you can do about it is to have neighboring threads read/write neighboring addresses in global memory. This produces coalescing and speeds up the process. But in general: read your data once, process it, and write it out once.

As @OlegTitov said, frequent loads/stores to global memory should be avoided as much as possible. When the situation makes them unavoidable, coalesced memory accesses can keep execution from becoming too slow; in most cases, however, histogram calculation makes it quite hard to achieve coalesced access.

While most of the above basically just restates @OlegTitov's answer, I would just like to share an investigation I did on computing summations with NVIDIA CUDA. The results are actually quite interesting, and I hope they are useful information for other CUDA developers.

The experiment was basically a speed test of a summation under various memory-access patterns: using global memory (1 thread), the L2 cache (atomic operations, 128 threads), and the L1 cache (shared memory, 128 threads).

This experiment used a Kepler GTX 680, 1546 cores @ 1.06 GHz, GDDR5 256-bit @ 3 GHz.
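As a side note, here is a minimal sketch of what coalescing means in practice. These two kernels are illustrative only and are not part of the experiment below:

// Coalesced: consecutive threads access consecutive addresses, so the 32
// accesses of a warp combine into a few wide memory transactions.
__global__ void copy_coalesced(const float *in, float *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i];
}

// Uncoalesced: consecutive threads access addresses a large stride apart,
// so each access can become its own transaction and bandwidth collapses.
__global__ void copy_strided(const float *in, float *out, int n, int stride) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    long long j = ((long long)i * stride) % n;   // scattered index
    if (i < n) out[j] = in[j];
}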
Here are the kernels:

__global__
void glob(float *h) {
    float* hist = h;
    uint sd = SEEDRND;
    uint random;
    for (int i = 0; i < NUMLOOP; i++) {
        if (i % NTHREADS == 0) random = rnd(sd);
        int rind = random % NBIN;
        float randval = (float)(random % 10) * 1.0f;
        hist[rind] += randval;   // read-modify-write directly in global memory
    }
}
__global__
void atom(float *h) {
    float* hist = h;
    uint sd = SEEDRND;
    for (int i = threadIdx.x; i < NUMLOOP; i += NTHREADS) {
        uint random = rnd(sd);
        int rind = random % NBIN;
        float randval = (float)(random % 10) * 1.0f;
        atomicAdd(&hist[rind], randval);   // serialized through the L2 cache
    }
}
__global__
void shm(float *h) {
    int lid = threadIdx.x;
    uint sd = SEEDRND;
    __shared__ float shm[NTHREADS][NBIN];   // one private histogram per thread
    for (int i = 0; i < NBIN; i++) shm[lid][i] = h[i];
    for (int i = lid; i < NUMLOOP; i += NTHREADS) {
        uint random = rnd(sd);
        int rind = random % NBIN;
        float randval = (float)(random % 10) * 1.0f;
        shm[lid][rind] += randval;
    }

    /* reduction here */
    for (int i = 0; i < NBIN; i++) {
        __syncthreads();
        if (threadIdx.x < 64) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 64][i];
        }
        __syncthreads();
        if (threadIdx.x < 32) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 32][i];
        }
        __syncthreads();
        if (threadIdx.x < 16) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 16][i];
        }
        __syncthreads();
        if (threadIdx.x < 8) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 8][i];
        }
        __syncthreads();
        if (threadIdx.x < 4) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 4][i];
        }
        __syncthreads();
        if (threadIdx.x < 2) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 2][i];
        }
        __syncthreads();
        if (threadIdx.x == 0) {
            shm[0][i] += shm[1][i];
        }
    }
    for (int i = 0; i < NBIN; i++) h[i] = shm[0][i];
}
The ratio among these kernels was 57 : 17 : 1. Many things can be analyzed here, and it truly does not mean that using L1 or L2 memory space will always give you more than a tenfold speedup of the whole program.

And here are the main and other functions:
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <climits>   // for LONG_MAX

using namespace std;

#define NUMLOOP 1000000
#define NBIN 36
#define SEEDRND 1
#define NTHREADS 128
#define NBLOCKS 1

// Park-Miller linear congruential generator
__device__ uint rnd(uint & seed) {
#if LONG_MAX > (16807*2147483647)
    int const a = 16807;
    int const m = 2147483647;
    seed = (long(seed * a)) % m;
    return seed;
#else
    double const a = 16807;
    double const m = 2147483647;
    double temp = seed * a;
    seed = (int)(temp - m * floor(temp / m));
    return seed;
#endif
}

... the above kernels ...

int main()
{
    float *h_hist, *h_hist2, *h_hist3, *d_hist, *d_hist2, *d_hist3;
    h_hist = (float*)malloc(NBIN * sizeof(float));
    h_hist2 = (float*)malloc(NBIN * sizeof(float));
    h_hist3 = (float*)malloc(NBIN * sizeof(float));
    cudaMalloc((void**)&d_hist, NBIN * sizeof(float));
    cudaMalloc((void**)&d_hist2, NBIN * sizeof(float));
    cudaMalloc((void**)&d_hist3, NBIN * sizeof(float));

    // zero all three device histograms
    for (int i = 0; i < NBIN; i++) h_hist[i] = 0.0f;
    cudaMemcpy(d_hist, h_hist, NBIN * sizeof(float),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_hist2, h_hist, NBIN * sizeof(float),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_hist3, h_hist, NBIN * sizeof(float),
               cudaMemcpyHostToDevice);

    cudaEvent_t start, end;
    float elapsed = 0, elapsed2 = 0, elapsed3 = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaEventRecord(start, 0);
    atom<<<NBLOCKS, NTHREADS>>>(d_hist);
    cudaThreadSynchronize();
    cudaEventRecord(end, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed, start, end);

    cudaEventRecord(start, 0);
    shm<<<NBLOCKS, NTHREADS>>>(d_hist2);
    cudaThreadSynchronize();
    cudaEventRecord(end, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed2, start, end);

    cudaEventRecord(start, 0);
    glob<<<1, 1>>>(d_hist3);
    cudaThreadSynchronize();
    cudaEventRecord(end, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed3, start, end);

    cudaMemcpy(h_hist, d_hist, NBIN * sizeof(float),
               cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hist2, d_hist2, NBIN * sizeof(float),
               cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hist3, d_hist3, NBIN * sizeof(float),
               cudaMemcpyDeviceToHost);

    /* print output */
    for (int i = 0; i < NBIN; i++) {
        printf("atom: %10.2f shm: %10.2f glob: %10.2f\n",
               h_hist[i], h_hist2[i], h_hist3[i]);
    }
    printf("%12s: %8.4f msec\n", "One Thread", elapsed3);
    printf("%12s: %8.4f msec\n", "Atomic", elapsed);
    printf("%12s: %8.4f msec\n", "Sh_mem", elapsed2);

    return 0;
}
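For reference, a listing like this can be built with nvcc; the file name below is arbitrary. On a Kepler card such as the GTX 680 you would target compute capability 3.0 (note that atomicAdd on float requires at least sm_20, so the default architecture of older toolkits will not compile the atom kernel):

nvcc -arch=sm_30 -o histo histo.cu
./histo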
And here is the result:
atom: 102656.00 shm: 102656.00 glob: 102656.00
atom: 122240.00 shm: 122240.00 glob: 122240.00
... blah blah blah ...
One Thread: 126.3919 msec
Atomic: 7.5459 msec
Sh_mem: 2.2207 msec
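Coming back to the original question's Thrust functor, the same "read once, process, write once" advice applies. Below is a minimal, hypothetical sketch; the placeholder calculation and the index * 30 + j memory layout are my assumptions, not the asker's actual code. The idea is to accumulate the 30 values in a local array, then write each slot to global memory exactly once, instead of doing read-modify-writes on global memory inside the loop as in (1):

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

struct HistogramCreationSketch {
    float *_memoryPointer;   // raw device pointer, as in the question

    explicit HistogramCreationSketch(float *p) : _memoryPointer(p) {}

    __device__ void operator()(int index) const {
        float local[30];
        // Do the per-index work entirely in registers / local memory ...
        for (int j = 0; j < 30; j++)
            local[j] = (float)(index + j);   // placeholder calculation
        // ... then issue the global writes once, at the end.
        for (int j = 0; j < 30; j++)
            _memoryPointer[index * 30 + j] = local[j];
    }
};

void foo(int imageSize) {
    thrust::device_vector<float> mem(imageSize * 30);
    thrust::for_each(thrust::device,
                     thrust::make_counting_iterator(0),
                     thrust::make_counting_iterator(imageSize),
                     HistogramCreationSketch(
                         thrust::raw_pointer_cast(mem.data())));
}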