Cuda 表面内存比全局内存占用更多的时间(两倍)
我正在研究 CUDA 程序的优化,于是先从矩阵乘法程序入手。我采用的并行化线程方案是 Blocksize(1,1)、Gridsize(N,N)。我使用表面内存(surface memory)来做内存优化(因为在这种线程方案下无法使用共享内存)。比较优化前后的耗时,我发现使用表面内存后执行时间反而变成了原来的两倍(我尝试过其他线程方案,问题依旧)。我读到的所有资料都说全局内存比表面内存慢,所以使用表面内存应该耗时更少才对。下面给出使用表面内存的矩阵乘法程序。谁能告诉我问题出在哪里?
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
//#define N 3
// File-scope 2D surface references. main binds them to the CUDA arrays that
// hold the two input matrices (a_surf, b_surf) and the output matrix (c_surf);
// the mul kernel reads and writes the matrices through them.
surface < void, 2 > a_surf;
surface < void, 2 > b_surf;
surface < void, 2 > c_surf;
// Checks the status of a CUDA runtime call and aborts the program on failure.
//   call: status code returned by the CUDA API call being checked.
//   line: source line of the call site (callers pass __LINE__).
// On any status other than cudaSuccess it prints the line, the numeric error
// code and the cudaGetErrorString() text, then terminates with exit(-1).
void CUDA_SAFE_CALL(cudaError_t call, int line) {
    if (call == cudaSuccess) {
        return;  // nothing to report
    }
    printf("ERROR at line :%i.%d' ' %s\n",
           line, call, cudaGetErrorString(call));
    exit(-1);
}
// Matrix-multiply kernel: C = A * B for N x N int matrices accessed through
// the file-scope surfaces a_surf (A), b_surf (B) and c_surf (C).
//
// Launch layout: any 2D grid/block combination that covers at least N x N
// threads; threads that fall outside the N x N range do nothing.
// Thread (x, y) produces the output element in column x, row y.
//
// Surface coordinates are (byte offset along the row, row index). main uploads
// the host matrices row by row, so host element M[r][c] lives at surface
// coordinate (c * sizeof(int), r). The previous version read A at (x, i) and
// B at (i, y), i.e. A[i][x] and B[y][i], which yields B * A rather than A * B.
//
//   N: matrix dimension (number of rows and of columns).
__global__ void mul(int N) {
    // Signed indices keep the bounds check free of signed/unsigned mixing and
    // match the int coordinates taken by surf2Dread/surf2Dwrite.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (col >= N || row >= N) {
        return;
    }

    // The x coordinate of a surface access is expressed in bytes.
    const int elemBytes = (int)sizeof(int);

    int acc = 0;
    for (int i = 0; i < N; i++) {
        int a, b;
        surf2Dread(&a, a_surf, i * elemBytes, row);    // A[row][i]
        surf2Dread(&b, b_surf, col * elemBytes, i);    // B[i][col]
        acc += a * b;
    }

    // Write C[row][col] to the output surface.
    surf2Dwrite(acc, c_surf, col * elemBytes, row);
}
// Host driver for the surface-memory matrix multiplication.
// Fills two N x N int matrices with 1..N*N, uploads them into CUDA arrays,
// binds the arrays to the file-scope surfaces, launches mul, reports the
// elapsed kernel time and copies the result back into c.
// Returns 0 on success; any CUDA failure terminates through CUDA_SAFE_CALL.
int main() {
    // const makes N a constant expression, so the arrays below are ordinary
    // fixed-size arrays instead of variable-length arrays (a compiler
    // extension in C++).
    const int N = 100;
    int a[N][N], b[N][N], c[N][N];
    int i, j;
    int temp;
    clock_t t1, t2;
    cudaArray * da, * db, * dc;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc < int > ();

    // One thread per block and one block per output element, as described in
    // the question.
    // NOTE(review): with a 1x1 block only one lane of every warp is active;
    // a block of at least 32 (preferably 256+) threads is the usual choice.
    dim3 dimBlock(1, 1);
    dim3 dimGrid(N, N);

    // Both inputs hold the values 1..N*N in row-major order.
    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            a[i][j] = ++temp;
    temp = 0;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            b[i][j] = ++temp;

    CUDA_SAFE_CALL(cudaMallocArray( & da, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray( & db, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray( & dc, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);

    const size_t s = (size_t) N * N * sizeof(int);
    CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);

    CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);

    t1 = clock();
    mul <<<dimGrid, dimBlock>>> (N);
    // A kernel launch returns immediately. Check the launch itself, then wait
    // for the kernel to finish so that t2 - t1 covers the execution and not
    // just the launch overhead; the synchronize also reports in-kernel faults.
    CUDA_SAFE_CALL(cudaGetLastError(), __LINE__);
    CUDA_SAFE_CALL(cudaDeviceSynchronize(), __LINE__);
    t2 = clock();

    CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);

    double t3 = (double) t2 - (double) t1;
    t3 = t3 / CLOCKS_PER_SEC;
    printf("\n CUDA time :%lf", t3);

    CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
    return 0;
}
#包括
#包括
//#定义n3
表面a_surf;
表面b_surf;
表面c_-surf;
无效CUDA_安全呼叫(cudaError_t呼叫,内线){
交换机(呼叫){
成功案例:
打破
违约:
printf(“第%i.%d'%s行出现错误”,
行、调用、cudaGetErrorString(调用));
出口(-1);
打破
}
}
__全局无效mul(整数N){
内部a、b、c、温度;
int i;
无符号整数x=blockIdx.x*blockDim.x+(threadIdx.x);
无符号整数y=blockIdx.y*blockDim.y+(threadIdx.y);
if(x();
dim3 dimBlock(1,1);
dim3 dimGrid(N,N);
温度=0;
对于(i=0;i
优化缓存不是一件小事。所以像这样的简单化概括:
无论我读到什么,全局内存都比表面内存慢。因此,使用表面存储器应该花费更少的时间
在我看来,它们的范围太广,以至于是不正确的。这通常是正确的,但并不总是正确的。细节很重要,正确的编程实践也很重要
表面内存不过是带有中间缓存的全局内存。但是全局内存(在当前CUDA版本支持的所有GPU上)已经有了二级(在某些情况下还有一级)缓存的支持
您建议用于测试/比较的代码有许多问题,我要指出:
t1 = clock();
mul <<<dimGrid, dimBlock>>> (N);
t2 = clock();
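上面这段计时代码只测到了内核启动的开销:内核启动是异步的,`clock()` 在内核执行完之前就已返回,因此必须在读取 `t2` 之前调用 `cudaDeviceSynchronize()`。另外,你使用了 `dim3 dimBlock(1, 1);`,即每个线程块只启动一个线程。由于每个线程束(warp)的 32 个线程中有 31 个处于非活动状态,GPU 性能的 31/32 都被闲置了,这会带来广泛的影响。我没有兴趣研究这种场景下的性能,你也不应该这样做(因为它不能反映编写良好的代码的真实性能),除非你感兴趣的是微基准测试(而不是对比基准测试)。因此,你的代码应当改为每个线程块至少使用 32 个线程,最好是 256 个或更多。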
$ cat t1129.cu
#include <stdio.h>
#include <iostream>
// Element type of the matrices; also used for the surface channel format.
typedef int mytype;
// Edge length of the square thread block used for the kernel launch.
const int blk_dim=16;
// Matrix dimension: the matrices are my_N x my_N.
#define my_N 1000
// Constant fill values for the input matrices a and b. main checks every
// result element against A_VAL*B_VAL*N.
#define A_VAL 1
#define B_VAL 2
// File-scope 2D surface references that main binds to the CUDA arrays holding
// the inputs (a_surf, b_surf) and the output (c_surf).
surface < void, 2 > a_surf;
surface < void, 2 > b_surf;
surface < void, 2 > c_surf;
// Error guard for CUDA runtime calls.
//   call: status code returned by the wrapped CUDA API call.
//   line: source line of the call site (callers pass __LINE__).
// A cudaSuccess status is a no-op. Any other status prints the line, the
// numeric error code and its cudaGetErrorString() description, and then ends
// the process with exit(-1).
void CUDA_SAFE_CALL(cudaError_t call, int line) {
    const bool failed = (call != cudaSuccess);
    if (failed) {
        printf("ERROR at line :%i.%d' ' %s\n",
               line, call, cudaGetErrorString(call));
        exit(-1);
    }
}
// Matrix-multiply kernel: C = A * B for N x N matrices of mytype.
//
// Built with -DUSE_GLOBAL the operands are plain row-major device buffers
// (d_a, d_b, d_c); otherwise they are accessed through the file-scope surfaces
// a_surf, b_surf and c_surf.
//
// Launch layout: any 2D grid/block combination covering at least N x N
// threads; threads outside the N x N range do nothing.
//
//   d_a, d_b: input matrices, row-major, N*N elements (USE_GLOBAL only).
//   d_c:      output matrix, row-major, N*N elements (USE_GLOBAL only).
//   N:        matrix dimension.
#ifdef USE_GLOBAL
__global__ void mul(const mytype * __restrict__ d_a, const mytype * __restrict__ d_b, mytype * __restrict__ d_c, const int N)
#else
__global__ void mul(const int N)
#endif
{
    // Signed indices keep the bounds check free of signed/unsigned mixing.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= N || y >= N) {
        return;
    }

#ifndef USE_GLOBAL
    // The x coordinate of a surface access is expressed in bytes.
    const int elemBytes = (int)sizeof(mytype);
#endif

    mytype temp = 0;
    for (int i = 0; i < N; i++) {
        mytype a, b;
#ifdef USE_GLOBAL
        // Global path: x is the output row, y the output column.
        // NOTE(review): x comes from threadIdx.x, so neighbouring threads read
        // d_a elements that are N apart (strided, not contiguous).
        a = d_a[x*N+i];
        b = d_b[i*N+y];
#else
        // Surface path: x is the output column, y the output row. Surface
        // coordinates are (byte offset along the row, row index) and main
        // uploads the host data row by row, so these reads fetch A[y][i] and
        // B[i][x]. The previous reads at (x, i) and (i, y) fetched A[i][x] and
        // B[y][i], which computed B * A and disagreed with the global path.
        surf2Dread( & a, a_surf, i * elemBytes, y);
        surf2Dread( & b, b_surf, x * elemBytes, i);
#endif
        temp += a * b;
    }

#ifdef USE_GLOBAL
    d_c[x*N+y] = temp;
#else
    // Write C[y][x] to the output surface.
    surf2Dwrite(temp, c_surf, x * elemBytes, y);
#endif
}
// Host driver for the surface-vs-global matrix multiplication benchmark.
// Fills a with A_VAL and b with B_VAL, uploads them (into CUDA arrays bound to
// the surfaces and, with -DUSE_GLOBAL, into linear device buffers), times one
// synchronized launch of mul, and verifies that every result element equals
// A_VAL*B_VAL*N.
// Returns 0 when the result verifies, 1 on a mismatch or a failed host
// allocation; CUDA failures terminate through CUDA_SAFE_CALL.
int main() {
    const int N = my_N;
    mytype *a, *b, *c, *d_a, *d_b, *d_c;
    int i;
    clock_t t1, t2;
    cudaArray * da, * db, * dc;
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc < mytype > ();
    dim3 dimBlock(blk_dim, blk_dim);
    // Round up so the grid covers N even when N is not a multiple of blk_dim.
    dim3 dimGrid((N+dimBlock.x-1)/dimBlock.x, (N+dimBlock.y-1)/dimBlock.y);
    const size_t s = (size_t) N * N * sizeof(mytype);

    a = (mytype *)malloc(s);
    b = (mytype *)malloc(s);
    c = (mytype *)malloc(s);
    if (a == NULL || b == NULL || c == NULL) {
        printf("ERROR: host allocation of %zu bytes failed\n", s);
        free(a);
        free(b);
        free(c);
        return 1;
    }

    CUDA_SAFE_CALL(cudaMalloc(&d_a, s), __LINE__);
    CUDA_SAFE_CALL(cudaMalloc(&d_b, s), __LINE__);
    CUDA_SAFE_CALL(cudaMalloc(&d_c, s), __LINE__);

    // Constant inputs make every element of the product equal A_VAL*B_VAL*N.
    for (i = 0; i < N * N; i++) {
        a[i] = A_VAL;
        b[i] = B_VAL;
    }

    CUDA_SAFE_CALL(cudaMallocArray( & da, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray( & db, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMallocArray( & dc, & channelDesc, N, N, cudaArraySurfaceLoadStore), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(da, 0, 0, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpyToArray(db, 0, 0, b, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(a_surf, da), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(b_surf, db), __LINE__);
    CUDA_SAFE_CALL(cudaBindSurfaceToArray(c_surf, dc), __LINE__);
#ifdef USE_GLOBAL
    CUDA_SAFE_CALL(cudaMemcpy(d_a, a, s, cudaMemcpyHostToDevice), __LINE__);
    CUDA_SAFE_CALL(cudaMemcpy(d_b, b, s, cudaMemcpyHostToDevice), __LINE__);
#endif

    t1 = clock();
#ifdef USE_GLOBAL
    mul <<<dimGrid, dimBlock>>> (d_a, d_b, d_c, N);
#else
    mul <<<dimGrid, dimBlock>>> (N);
#endif
    // Check the launch itself, then wait for completion so the measured
    // interval covers the kernel execution; the synchronize also reports
    // faults raised while the kernel was running.
    CUDA_SAFE_CALL(cudaGetLastError(), __LINE__);
    CUDA_SAFE_CALL(cudaDeviceSynchronize(), __LINE__);
    t2 = clock();

    // Fetch the result from whichever buffer the kernel actually wrote.
#ifdef USE_GLOBAL
    CUDA_SAFE_CALL(cudaMemcpy(c, d_c, s, cudaMemcpyDeviceToHost), __LINE__);
#else
    CUDA_SAFE_CALL(cudaMemcpyFromArray(c, dc, 0, 0, s, cudaMemcpyDeviceToHost), __LINE__);
#endif

    double t3 = (double) t2 - (double) t1;
    t3 = t3 / CLOCKS_PER_SEC;
    printf("\n CUDA time :%lf\n", t3);

    // Verify, remembering the outcome so that cleanup runs on both paths.
    int status = 0;
    for (i = 0; i < N*N; i++) {
        if (c[i] != A_VAL*B_VAL*N) {
            std::cout << "mismatch at: " << i << ", was: " << c[i] << " should be: " << A_VAL*B_VAL*N << std::endl;
            status = 1;
            break;
        }
    }

    CUDA_SAFE_CALL(cudaFreeArray(da), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(db), __LINE__);
    CUDA_SAFE_CALL(cudaFreeArray(dc), __LINE__);
    CUDA_SAFE_CALL(cudaFree(d_a), __LINE__);
    CUDA_SAFE_CALL(cudaFree(d_b), __LINE__);
    CUDA_SAFE_CALL(cudaFree(d_c), __LINE__);
    free(a);
    free(b);
    free(c);

    if (status == 0) {
        std::cout << "Success!" << std::endl;
    }
    return status;
}
[bob@cluster1 misc]$ nvcc -O3 -o t1129 t1129.cu
[bob@cluster1 misc]$ ./t1129
CUDA time :0.028771
Success!
$ nvcc -O3 -DUSE_GLOBAL -o t1129 t1129.cu
$ ./t1129
CUDA time :0.243635
Success!
$