Cuda global memory writes are extremely slow

I am currently writing code that computes an integral histogram on the GPU using the NVIDIA Thrust library.

I allocate a contiguous block of device memory, which I keep updating with a custom functor. The problem is that writes to device memory are very slow, while reads are actually fine.

The basic setup is the following:
struct HistogramCreation
{
  HistogramCreation(
    ...
    // pointer to memory
    ...
  ){}

  /// The actual summation operator
  __device__ void operator()(int index){
    .. do the calculations ..
    for(int j = 0; j < 30; j++){

      (1)  *_memoryPointer = values (also using reads to such locations);

    }
  }
};
void foo(){

  cudaMalloc(_pointer, size);

  HistogramCreation initialCreation( ... _pointer ...);
  thrust::for_each(
    thrust::make_counting_iterator(0),
    thrust::make_counting_iterator(_imageSize),
    initialCreation);
}
If I change the write in (1) to a write into a local variable instead, the performance is much better. This is the only global memory write I have.

Using the memory write I get about 2 s for HD footage; using the local variable it takes about 50 ms, so about a factor of 40 less.
Why is this so slow? How can I improve it?

When writing GPU code you should avoid reading from and writing to global memory wherever you can. Global memory on the GPU is very slow; that is a hardware characteristic. The only thing you can do about it is to have neighboring threads read/write neighboring addresses in global memory. This produces coalescing and speeds up the process. But in general: read your data once, process it, and write it out once.

As @OlegTitov said, frequent loads/stores to global memory should be avoided as much as possible. When the situation makes them unavoidable, coalesced memory accesses can keep execution from becoming too slow; in most cases, however, histogram calculation makes it quite hard to achieve coalesced access.

While most of the above basically just restates @OlegTitov's answer, I would just like to share an investigation I did on computing summations with NVIDIA CUDA. The results are actually quite interesting, and I hope they are useful information for other CUDA developers.

The experiment was basically a speed test of a summation under various memory-access patterns: using global memory (1 thread), the L2 cache (atomic operations, 128 threads), and the L1 cache (shared memory, 128 threads).

This experiment used a Kepler GTX 680, 1546 cores @ 1.06 GHz, GDDR5 256-bit @ 3 GHz.
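As a side note, here is a minimal sketch of what coalescing means in practice. These two kernels are illustrative only and are not part of the experiment below:

// Coalesced: consecutive threads access consecutive addresses, so the 32
// accesses of a warp combine into a few wide memory transactions.
__global__ void copy_coalesced(const float *in, float *out, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i];
}

// Uncoalesced: consecutive threads access addresses a large stride apart,
// so each access can become its own transaction and bandwidth collapses.
__global__ void copy_strided(const float *in, float *out, int n, int stride) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    long long j = ((long long)i * stride) % n;   // scattered index
    if (i < n) out[j] = in[j];
}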
Here are the kernels:

__global__
void glob(float *h) {
    float* hist = h;
    uint sd = SEEDRND;
    uint random;
    for (int i = 0; i < NUMLOOP; i++) {
        if (i % NTHREADS == 0) random = rnd(sd);
        int rind = random % NBIN;
        float randval = (float)(random % 10) * 1.0f;
        hist[rind] += randval;   // read-modify-write directly in global memory
    }
}
__global__
void atom(float *h) {
    float* hist = h;
    uint sd = SEEDRND;
    for (int i = threadIdx.x; i < NUMLOOP; i += NTHREADS) {
        uint random = rnd(sd);
        int rind = random % NBIN;
        float randval = (float)(random % 10) * 1.0f;
        atomicAdd(&hist[rind], randval);   // serialized through the L2 cache
    }
}
__global__
void shm(float *h) {
    int lid = threadIdx.x;
    uint sd = SEEDRND;
    __shared__ float shm[NTHREADS][NBIN];   // one private histogram per thread
    for (int i = 0; i < NBIN; i++) shm[lid][i] = h[i];
    for (int i = lid; i < NUMLOOP; i += NTHREADS) {
        uint random = rnd(sd);
        int rind = random % NBIN;
        float randval = (float)(random % 10) * 1.0f;
        shm[lid][rind] += randval;
    }

    /* reduction here */
    for (int i = 0; i < NBIN; i++) {
        __syncthreads();
        if (threadIdx.x < 64) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 64][i];
        }
        __syncthreads();
        if (threadIdx.x < 32) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 32][i];
        }
        __syncthreads();
        if (threadIdx.x < 16) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 16][i];
        }
        __syncthreads();
        if (threadIdx.x < 8) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 8][i];
        }
        __syncthreads();
        if (threadIdx.x < 4) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 4][i];
        }
        __syncthreads();
        if (threadIdx.x < 2) {
            shm[threadIdx.x][i] += shm[threadIdx.x + 2][i];
        }
        __syncthreads();
        if (threadIdx.x == 0) {
            shm[0][i] += shm[1][i];
        }
    }
    for (int i = 0; i < NBIN; i++) h[i] = shm[0][i];
}
The ratio among these kernels was 57 : 17 : 1. Many things can be analyzed here, and it truly does not mean that using L1 or L2 memory space will always give you more than a tenfold speedup of the whole program.

And here are the main and other functions:
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <climits>   // for LONG_MAX

using namespace std;

#define NUMLOOP 1000000
#define NBIN 36
#define SEEDRND 1
#define NTHREADS 128
#define NBLOCKS 1

// Park-Miller linear congruential generator
__device__ uint rnd(uint & seed) {
#if LONG_MAX > (16807*2147483647)
    int const a = 16807;
    int const m = 2147483647;
    seed = (long(seed * a)) % m;
    return seed;
#else
    double const a = 16807;
    double const m = 2147483647;
    double temp = seed * a;
    seed = (int)(temp - m * floor(temp / m));
    return seed;
#endif
}

... the above kernels ...

int main()
{
    float *h_hist, *h_hist2, *h_hist3, *d_hist, *d_hist2, *d_hist3;
    h_hist = (float*)malloc(NBIN * sizeof(float));
    h_hist2 = (float*)malloc(NBIN * sizeof(float));
    h_hist3 = (float*)malloc(NBIN * sizeof(float));
    cudaMalloc((void**)&d_hist, NBIN * sizeof(float));
    cudaMalloc((void**)&d_hist2, NBIN * sizeof(float));
    cudaMalloc((void**)&d_hist3, NBIN * sizeof(float));

    // zero all three device histograms
    for (int i = 0; i < NBIN; i++) h_hist[i] = 0.0f;
    cudaMemcpy(d_hist, h_hist, NBIN * sizeof(float),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_hist2, h_hist, NBIN * sizeof(float),
               cudaMemcpyHostToDevice);
    cudaMemcpy(d_hist3, h_hist, NBIN * sizeof(float),
               cudaMemcpyHostToDevice);

    cudaEvent_t start, end;
    float elapsed = 0, elapsed2 = 0, elapsed3 = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&end);

    cudaEventRecord(start, 0);
    atom<<<NBLOCKS, NTHREADS>>>(d_hist);
    cudaThreadSynchronize();
    cudaEventRecord(end, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed, start, end);

    cudaEventRecord(start, 0);
    shm<<<NBLOCKS, NTHREADS>>>(d_hist2);
    cudaThreadSynchronize();
    cudaEventRecord(end, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed2, start, end);

    cudaEventRecord(start, 0);
    glob<<<1, 1>>>(d_hist3);
    cudaThreadSynchronize();
    cudaEventRecord(end, 0);
    cudaEventSynchronize(start);
    cudaEventSynchronize(end);
    cudaEventElapsedTime(&elapsed3, start, end);

    cudaMemcpy(h_hist, d_hist, NBIN * sizeof(float),
               cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hist2, d_hist2, NBIN * sizeof(float),
               cudaMemcpyDeviceToHost);
    cudaMemcpy(h_hist3, d_hist3, NBIN * sizeof(float),
               cudaMemcpyDeviceToHost);

    /* print output */
    for (int i = 0; i < NBIN; i++) {
        printf("atom: %10.2f shm: %10.2f glob: %10.2f\n",
               h_hist[i], h_hist2[i], h_hist3[i]);
    }
    printf("%12s: %8.4f msec\n", "One Thread", elapsed3);
    printf("%12s: %8.4f msec\n", "Atomic", elapsed);
    printf("%12s: %8.4f msec\n", "Sh_mem", elapsed2);

    return 0;
}
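For reference, a listing like this can be built with nvcc; the file name below is arbitrary. On a Kepler card such as the GTX 680 you would target compute capability 3.0 (note that atomicAdd on float requires at least sm_20, so the default architecture of older toolkits will not compile the atom kernel):

nvcc -arch=sm_30 -o histo histo.cu
./histo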
And here is the result:
atom: 102656.00 shm: 102656.00 glob: 102656.00
atom: 122240.00 shm: 122240.00 glob: 122240.00
... blah blah blah ...
One Thread: 126.3919 msec
Atomic: 7.5459 msec
Sh_mem: 2.2207 msec
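Coming back to the original question's Thrust functor, the same "read once, process, write once" advice applies. Below is a minimal, hypothetical sketch; the placeholder calculation and the index * 30 + j memory layout are my assumptions, not the asker's actual code. The idea is to accumulate the 30 values in a local array, then write each slot to global memory exactly once, instead of doing read-modify-writes on global memory inside the loop as in (1):

#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

struct HistogramCreationSketch {
    float *_memoryPointer;   // raw device pointer, as in the question

    explicit HistogramCreationSketch(float *p) : _memoryPointer(p) {}

    __device__ void operator()(int index) const {
        float local[30];
        // Do the per-index work entirely in registers / local memory ...
        for (int j = 0; j < 30; j++)
            local[j] = (float)(index + j);   // placeholder calculation
        // ... then issue the global writes once, at the end.
        for (int j = 0; j < 30; j++)
            _memoryPointer[index * 30 + j] = local[j];
    }
};

void foo(int imageSize) {
    thrust::device_vector<float> mem(imageSize * 30);
    thrust::for_each(thrust::device,
                     thrust::make_counting_iterator(0),
                     thrust::make_counting_iterator(imageSize),
                     HistogramCreationSketch(
                         thrust::raw_pointer_cast(mem.data())));
}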