C++ CUDA:代码并非对所有尺寸都有效
我正在实现一个三维拉普拉斯算子。我的代码在 N=32 时结果正确,但在 N=64 或 N=128 时,我得到了一些不正确的结果:C++ CUDA:代码并非对所有尺寸都有效,c++,cuda,C++,Cuda,我正在实现一个三维拉普拉斯算子。我的代码在 N=32 时结果正确,但在 N=64 或 N=128 时,我得到了一些不正确的结果: #include <iostream> #include <sys/time.h> #include <cuda.h> #include <ctime> #include"res3dcb.cuh" #include <math.h> using namespace std; // Let's start the mai
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;
// Let's start the main program.
// Abort main() with a diagnostic as soon as a CUDA runtime call fails.
// Only usable inside main(): it returns a non-zero exit code on failure.
#define CUDA_TRY(call)                                                    \
    do {                                                                  \
        cudaError_t cudaTryStatus_ = (call);                              \
        if (cudaTryStatus_ != cudaSuccess) {                              \
            cerr << "CUDA error at line " << __LINE__ << ": "             \
                 << cudaGetErrorString(cudaTryStatus_) << endl;           \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Interactively prints planes ("stages") of an (N+2)^3 cube stored as a flat
// array whose first index is the slowest-varying one.
//   cube   : host array holding (N+2)*(N+2)*(N+2) floats.
//   N      : interior edge length (the cube carries one halo cell per side).
//   prompt : question shown before each read.
// The loop ends when the user enters -1 or when reading from cin fails.
// Stage numbers outside [0, N+1] are rejected instead of indexing out of bounds.
static void showStages(const float *cube, int N, const char *prompt) {
    const size_t side = static_cast<size_t>(N) + 2;
    int id_stage = -2;
    while (id_stage != -1) {
        cout << prompt << endl;
        if (!(cin >> id_stage)) {
            // EOF or non-numeric input: stop asking rather than spin forever.
            break;
        }
        cout << endl;
        if (id_stage == -1) {
            break;
        }
        if (id_stage < 0 || static_cast<size_t>(id_stage) >= side) {
            cout << "Stage must be between 0 and " << side - 1 << endl;
            continue;
        }
        cout << "Etage " << id_stage << " du cube :" << endl;
        const float *plane = cube + static_cast<size_t>(id_stage) * side * side;
        for (size_t j = 0; j < side; ++j) {
            cout << "| ";
            for (size_t k = 0; k < side; ++k) {
                cout << plane[j * side + k] << " ";
            }
            cout << "|" << endl;
        }
        cout << endl;
    }
}

// Host driver: builds an (N+2)^3 input cube filled with ones (interior plus
// replicated halo), runs the res3d kernel once on the GPU, and lets the user
// inspect planes of the input and of the output, then prints timing figures.
// Returns 0 on success, 1 on invalid input or on any CUDA failure.
int main(void) {
    // Choice of N.
    int N;
    cout<<"Choose matrix dimension (32, 64 or 128)"<<endl;
    // The grid below is N/32 x N/8 x N/8 blocks, so N has to be a positive
    // multiple of 32; anything else would yield an empty or truncated grid.
    if (!(cin >> N) || N <= 0 || N % 32 != 0) {
        cerr << "Matrix dimension must be a positive multiple of 32" << endl;
        return 1;
    }

    // Element count and byte count are kept apart: new[] expects a number of
    // elements while cudaMalloc/cudaMemcpy expect a number of bytes.
    const size_t n = static_cast<size_t>(N);
    const size_t side = n + 2;
    const size_t count = side * side * side;
    const size_t bytes = count * sizeof(float);

    // Variable statement.
    struct timeval t1, t2;
    float *x_d = 0, *y_d = 0;
    float gflops;
    float NumOps;

    // Init x and y. Every cell of x is written below; y is fully overwritten
    // by the device-to-host copy after the kernel has run.
    float *x = new float[count];
    float *y = new float[count];
    for (size_t i = 1; i <= n; ++i)
        for (size_t j = 1; j <= n; ++j)
            for (size_t k = 1; k <= n; ++k)
                x[(i * side + j) * side + k] = 1.0f;

    // Shadow (halo) cells: replicate the nearest interior value. The order
    // matters: k-faces first, then j-faces (which reuse the k-faces), and
    // finally the two i-planes (which reuse everything else).
    for (size_t i = 1; i <= n; ++i) {
        for (size_t j = 1; j <= n; ++j) {
            float *row = x + (i * side + j) * side;
            row[0] = row[1];
            row[n + 1] = row[n];
        }
        float *slab = x + i * side * side;
        for (size_t k = 0; k < side; ++k) {
            slab[k] = slab[side + k];
            slab[(n + 1) * side + k] = slab[n * side + k];
        }
    }
    for (size_t j = 0; j < side; ++j)
        for (size_t k = 0; k < side; ++k) {
            x[side * j + k] = x[side * side + side * j + k];
            x[(n + 1) * side * side + side * j + k] = x[n * side * side + side * j + k];
        }

    // Display of initial matrix.
    showStages(x, N, "Which initial matrix's stage do you want to display? (-1 if you don't want to diplay another one)");

    // CPU to GPU. The output buffer is zeroed on the device so that cells the
    // kernel does not write come back as 0 instead of uninitialized memory.
    CUDA_TRY(cudaMalloc((void**) &x_d, bytes));
    CUDA_TRY(cudaMalloc((void**) &y_d, bytes));
    CUDA_TRY(cudaMemcpy(x_d, x, bytes, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemset(y_d, 0, bytes));

    // Solver parameters.
    // NOTE(review): 16-wide blocks on a grid of N/32 cover only half of x per
    // thread; the author states res3d tiles along x, so this is kept as is.
    // res3d lives in res3dcb.cuh and is not visible here -- confirm.
    dim3 dimGrid(N/32, N/8, N/8);
    dim3 dimBlock(16, 8, 8);

    // Solver loop.
    gettimeofday(&t1, 0);
    res3d<<<dimGrid, dimBlock>>>(x_d, y_d, N);
    CUDA_TRY(cudaGetLastError());        // launch-configuration errors
    CUDA_TRY(cudaDeviceSynchronize());   // errors raised while the kernel ran
    gettimeofday(&t2, 0);
    double time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;

    // Power calculation (7 operations per interior cell).
    NumOps=(1.0e-9)*N*N*N*7;
    // Guard against a zero reading from the microsecond-resolution timer.
    gflops = (time > 0.0) ? static_cast<float>(NumOps / time) : 0.0f;

    // GPU to CPU.
    CUDA_TRY(cudaMemcpy(y, y_d, bytes, cudaMemcpyDeviceToHost));
    CUDA_TRY(cudaFree(x_d));
    CUDA_TRY(cudaFree(y_d));

    // Display of final matrix.
    showStages(y, N, "Which output's stage do you want to display? (-1 if you don't want to diplay another one)");

    cout<<"Time : "<<time<<endl;
    cout<<"Gflops/s : "<<gflops<<endl;

    delete[] x;
    delete[] y;
    return 0;
}
#undef CUDA_TRY
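#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include"res3dcb.cuh"
#include <math.h>
using namespace std;
// Let's start the main program.
int main(void) {
// Choice of N.
int N;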
你这里是不是有一个错误:
dim3 dimGrid(N/32, N/8, N/8);
dim3 dimBlock(16, 8, 8);
这应该是:
dim3 dimGrid(N/16, N/8, N/8);
dim3 dimBlock(16, 8, 8);
此外,如评论中所述,您在此处过度分配内存:
x = new float[size];
y = new float[size];
因为 size 是以字节数计算的,而不是以元素个数计算的。
我找到错误了。dimGrid 和 dimBlock 并没有错,因为我在 x 轴方向上做了分块(tiling)。
错误出在 __global__ 内核里的 “if” 判断上。下面是一个性能更好、结果也正确的算法:
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include <math.h>
#include"reslap3D.cu"
using namespace std;
// Let's start the main program.
// Abort main() with a diagnostic as soon as a CUDA runtime call fails.
// Only usable inside main(): it returns a non-zero exit code on failure.
#define CUDA_TRY(call)                                                    \
    do {                                                                  \
        cudaError_t cudaTryStatus_ = (call);                              \
        if (cudaTryStatus_ != cudaSuccess) {                              \
            cerr << "CUDA error at line " << __LINE__ << ": "             \
                 << cudaGetErrorString(cudaTryStatus_) << endl;           \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Host driver: fills an (N+2)^3 cube with cos(k) plus a replicated halo,
// runs kernel1 once on the GPU, lets the user inspect planes of the output
// and prints timing figures.
// Returns 0 on success and 1 on any CUDA failure.
int main(void) {
    // Variable statement.
    struct timeval t1, t2;
    float gflops;
    float NumOps;
    double time;
    long int N=128;
    int size=(N+2);
    // Element count and byte count are kept apart: new[] expects a number of
    // elements while cudaMalloc/cudaMemcpy expect a number of bytes.
    const size_t side = static_cast<size_t>(size);
    const size_t n = static_cast<size_t>(N);
    const size_t count = side * side * side;
    const size_t size3 = count * sizeof(float);
    float *x = new float[count];
    float *y = new float[count];
    float *d_x = 0;
    float *d_y = 0;

    // Solver parameters. The geometry is hard-wired: 64x64 blocks of 2x2
    // threads span the first two axes, and the part of kernel1 visible in
    // this file handles the last axis as threadIdx.z+1 and threadIdx.z+1+64.
    // Any other N would leave cells uncovered, so fail loudly.
    dim3 dimBlock(2, 2, 64);
    dim3 dimGrid(64, 64);
    assert(n == static_cast<size_t>(dimGrid.x) * dimBlock.x);
    assert(n == static_cast<size_t>(dimGrid.y) * dimBlock.y);
    assert(n == 2 * static_cast<size_t>(dimBlock.z));

    // Init x.
    for (size_t i = 1; i <= n; ++i)
        for (size_t j = 1; j <= n; ++j)
            for (size_t k = 1; k <= n; ++k)
                x[(i * side + j) * side + k] = static_cast<float>(cos(static_cast<double>(k)));

    // Shadow (halo) cells: replicate the nearest interior value. The order
    // matters: k-faces first, then j-faces (which reuse the k-faces), and
    // finally the two i-planes (which reuse everything else).
    for (size_t i = 1; i <= n; ++i) {
        for (size_t j = 1; j <= n; ++j) {
            float *row = x + (i * side + j) * side;
            row[0] = row[1];
            row[n + 1] = row[n];
        }
        float *slab = x + i * side * side;
        for (size_t k = 0; k < side; ++k) {
            slab[k] = slab[side + k];
            slab[(n + 1) * side + k] = slab[n * side + k];
        }
    }
    // The i=0 and i=N+1 planes must be filled as well: kernel1 reads
    // x[D(c0-1, ...)] with c0 starting at 1, i.e. it reads plane 0.
    for (size_t j = 0; j < side; ++j)
        for (size_t k = 0; k < side; ++k) {
            x[side * j + k] = x[side * side + side * j + k];
            x[(n + 1) * side * side + side * j + k] = x[n * side * side + side * j + k];
        }

    // CPU to GPU. The output buffer is zeroed on the device so that cells the
    // kernel does not write come back as 0 instead of uninitialized memory.
    CUDA_TRY(cudaMalloc((void **) &d_x, size3));
    CUDA_TRY(cudaMalloc((void **) &d_y, size3));
    CUDA_TRY(cudaMemcpy(d_x, x, size3, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemset(d_y, 0, size3));

    // Solver loop.
    gettimeofday(&t1, 0);
    kernel1 <<<dimGrid, dimBlock>>> (d_x, d_y, size, N);
    CUDA_TRY(cudaGetLastError());        // launch-configuration errors
    CUDA_TRY(cudaDeviceSynchronize());   // errors raised while the kernel ran
    gettimeofday(&t2, 0);
    time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;

    // GPU to CPU.
    CUDA_TRY(cudaMemcpy(y, d_y, size3, cudaMemcpyDeviceToHost));
    CUDA_TRY(cudaFree(d_x));
    CUDA_TRY(cudaFree(d_y));

    // Power calculation (7 operations per interior cell).
    NumOps=(1.0e-9)*N*N*N*7;
    // Guard against a zero reading from the microsecond-resolution timer.
    gflops = (time > 0.0) ? static_cast<float>(NumOps / time) : 0.0f;

    // Display of final matrix.
    int id_stage=-2;
    while (id_stage!=-1) {
        cout<<"Which output's stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
        if (!(cin>>id_stage)) {
            // EOF or non-numeric input: stop asking rather than spin forever.
            break;
        }
        cout<<endl;
        if (id_stage == -1) {
            break;
        }
        if (id_stage < 0 || static_cast<size_t>(id_stage) >= side) {
            // Reject planes outside the cube instead of reading out of bounds.
            cout<<"Stage must be between 0 and "<<side-1<<endl;
            continue;
        }
        cout<<"Stage "<<id_stage<<" of cube :"<<endl;
        const float *plane = y + static_cast<size_t>(id_stage) * side * side;
        for (size_t j = 0; j < side; ++j) {
            cout<<"| ";
            for (size_t k = 0; k < side; ++k) {cout<<plane[j * side + k]<<" ";}
            cout<<"|"<<endl;
        }
        cout<<endl;
    }

    // Display of performances.
    cout<<"Time : "<<time<<endl;
    cout<<"Gflops/s : "<<gflops<<endl;

    delete[] x;
    delete[] y;
    return 0;
}
#undef CUDA_TRY
你说的“结果不正确”具体是什么意思?数据结构中是否出现了意外的数据?是哪一个?出现在哪个阶段?
事实上,我的结果应该全部为 0。N=32 时确实如此,但 N=64 或 128 时会出现一些 1 和 2,如下所示(N=128 的示例)。
这听起来像是代码审查(code review)类的问题,但事实并非如此。
当你为 x 和 y 调用 new 时,为什么指定了一个只对 malloc 才有意义的 size(字节数)?
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <sys/time.h>
#include <cuda.h>
#include <ctime>
#include <math.h>
#include"reslap3D.cu"
using namespace std;
// Let's start the main program.
// Abort main() with a diagnostic as soon as a CUDA runtime call fails.
// Only usable inside main(): it returns a non-zero exit code on failure.
#define CUDA_TRY(call)                                                    \
    do {                                                                  \
        cudaError_t cudaTryStatus_ = (call);                              \
        if (cudaTryStatus_ != cudaSuccess) {                              \
            cerr << "CUDA error at line " << __LINE__ << ": "             \
                 << cudaGetErrorString(cudaTryStatus_) << endl;           \
            return 1;                                                     \
        }                                                                 \
    } while (0)

// Host driver: fills an (N+2)^3 cube with cos(k) plus a replicated halo,
// runs kernel1 once on the GPU, lets the user inspect planes of the output
// and prints timing figures.
// Returns 0 on success and 1 on any CUDA failure.
int main(void) {
    // Variable statement.
    struct timeval t1, t2;
    float gflops;
    float NumOps;
    double time;
    long int N=128;
    int size=(N+2);
    // Element count and byte count are kept apart: new[] expects a number of
    // elements while cudaMalloc/cudaMemcpy expect a number of bytes.
    const size_t side = static_cast<size_t>(size);
    const size_t n = static_cast<size_t>(N);
    const size_t count = side * side * side;
    const size_t size3 = count * sizeof(float);
    float *x = new float[count];
    float *y = new float[count];
    float *d_x = 0;
    float *d_y = 0;

    // Solver parameters. The geometry is hard-wired: 64x64 blocks of 2x2
    // threads span the first two axes, and the part of kernel1 visible in
    // this file handles the last axis as threadIdx.z+1 and threadIdx.z+1+64.
    // Any other N would leave cells uncovered, so fail loudly.
    dim3 dimBlock(2, 2, 64);
    dim3 dimGrid(64, 64);
    assert(n == static_cast<size_t>(dimGrid.x) * dimBlock.x);
    assert(n == static_cast<size_t>(dimGrid.y) * dimBlock.y);
    assert(n == 2 * static_cast<size_t>(dimBlock.z));

    // Init x.
    for (size_t i = 1; i <= n; ++i)
        for (size_t j = 1; j <= n; ++j)
            for (size_t k = 1; k <= n; ++k)
                x[(i * side + j) * side + k] = static_cast<float>(cos(static_cast<double>(k)));

    // Shadow (halo) cells: replicate the nearest interior value. The order
    // matters: k-faces first, then j-faces (which reuse the k-faces), and
    // finally the two i-planes (which reuse everything else).
    for (size_t i = 1; i <= n; ++i) {
        for (size_t j = 1; j <= n; ++j) {
            float *row = x + (i * side + j) * side;
            row[0] = row[1];
            row[n + 1] = row[n];
        }
        float *slab = x + i * side * side;
        for (size_t k = 0; k < side; ++k) {
            slab[k] = slab[side + k];
            slab[(n + 1) * side + k] = slab[n * side + k];
        }
    }
    // The i=0 and i=N+1 planes must be filled as well: kernel1 reads
    // x[D(c0-1, ...)] with c0 starting at 1, i.e. it reads plane 0.
    for (size_t j = 0; j < side; ++j)
        for (size_t k = 0; k < side; ++k) {
            x[side * j + k] = x[side * side + side * j + k];
            x[(n + 1) * side * side + side * j + k] = x[n * side * side + side * j + k];
        }

    // CPU to GPU. The output buffer is zeroed on the device so that cells the
    // kernel does not write come back as 0 instead of uninitialized memory.
    CUDA_TRY(cudaMalloc((void **) &d_x, size3));
    CUDA_TRY(cudaMalloc((void **) &d_y, size3));
    CUDA_TRY(cudaMemcpy(d_x, x, size3, cudaMemcpyHostToDevice));
    CUDA_TRY(cudaMemset(d_y, 0, size3));

    // Solver loop.
    gettimeofday(&t1, 0);
    kernel1 <<<dimGrid, dimBlock>>> (d_x, d_y, size, N);
    CUDA_TRY(cudaGetLastError());        // launch-configuration errors
    CUDA_TRY(cudaDeviceSynchronize());   // errors raised while the kernel ran
    gettimeofday(&t2, 0);
    time = (1000000.0*(t2.tv_sec-t1.tv_sec) + t2.tv_usec-t1.tv_usec)/1000000.0;

    // GPU to CPU.
    CUDA_TRY(cudaMemcpy(y, d_y, size3, cudaMemcpyDeviceToHost));
    CUDA_TRY(cudaFree(d_x));
    CUDA_TRY(cudaFree(d_y));

    // Power calculation (7 operations per interior cell).
    NumOps=(1.0e-9)*N*N*N*7;
    // Guard against a zero reading from the microsecond-resolution timer.
    gflops = (time > 0.0) ? static_cast<float>(NumOps / time) : 0.0f;

    // Display of final matrix.
    int id_stage=-2;
    while (id_stage!=-1) {
        cout<<"Which output's stage do you want to display? (-1 if you don't want to diplay another one)"<<endl;
        if (!(cin>>id_stage)) {
            // EOF or non-numeric input: stop asking rather than spin forever.
            break;
        }
        cout<<endl;
        if (id_stage == -1) {
            break;
        }
        if (id_stage < 0 || static_cast<size_t>(id_stage) >= side) {
            // Reject planes outside the cube instead of reading out of bounds.
            cout<<"Stage must be between 0 and "<<side-1<<endl;
            continue;
        }
        cout<<"Stage "<<id_stage<<" of cube :"<<endl;
        const float *plane = y + static_cast<size_t>(id_stage) * side * side;
        for (size_t j = 0; j < side; ++j) {
            cout<<"| ";
            for (size_t k = 0; k < side; ++k) {cout<<plane[j * side + k]<<" ";}
            cout<<"|"<<endl;
        }
        cout<<endl;
    }

    // Display of performances.
    cout<<"Time : "<<time<<endl;
    cout<<"Gflops/s : "<<gflops<<endl;

    delete[] x;
    delete[] y;
    return 0;
}
#undef CUDA_TRY
// Flattens the 3-D coordinates (x, y, z) into a linear index, x being the
// slowest-varying coordinate and z the fastest. Expands to an expression that
// uses a variable named `size` (cells per edge) from the expansion site.
// Every argument and the whole result are parenthesized so that the macro
// stays correct inside larger expressions and with compound arguments.
#define D(x,y,z) (size*size*(x)+size*(y)+(z))
__global__ void kernel1(float *x, float *y, int size, int N)
{
__shared__ float sdata0[4][4][66];
__shared__ float sdata64[4][4][66];
int c0 = blockIdx.x*blockDim.x + threadIdx.x+1;
int c1 = blockIdx.y*blockDim.y + threadIdx.y+1;
int c2 = threadIdx.z+1;
int i = threadIdx.x+1, j = threadIdx.y+1, k = threadIdx.z+1;
if (threadIdx.x == 0)
{ sdata0[i-1][j][k] = x[D(c0-1,c1,c2)];
sdata64[i-1][j][k] = x[D(c0-1,c1,c2+64)];
}
if (threadIdx.x == 1)
{ sdata0[i+1][j][k] = x[D(c0+1,c1,c2)];
sdata64[i+1][j][k] = x[D(c0+1,c1,c2+64)];
}
if (threadIdx.y == 0)
{ sdata0[i][j-1][k] = x[D(c0,c1-1,c2)];
sdata64[i][j-1][k] = x[D(c0,c1-1,c2+64)];
}
if (threadIdx.y == 1)
{ sdata0[i][j+1][k] = x[D(c0,c1+1,c2)];
sdata64[i][j+1][k] = x[D(c0,c1+1,c2+64)];
}
if (threadIdx.z == 0)
{ sdata0[i][j][k-1] = x[D(c0,c1,c2-1)];
sdata64[i][j][k-1] = x[D(c0,c1,c2+63)];
}
if (threadIdx.z == 63)
{ sdata0[i][j][k+1] = x[D(c0,c1,c2+1)];
sdata64[i][j][k+1] = x[D(c0,c1,c2+65)];
}
sdata0[i][j][k] = x[D(c0,c1,c2)];
sdata64[i][j][k] = x[D(c0,c1,c2+64)];
__syncthreads();
y[D(c0, c1, c2)] = sdata0[i+1][j][k]
+ sdata0[i-1][j][k]
+ sdata0[i][j+1][k]
+ sdata0[i][j-1][k]
+ sdata0[i][j][k+1]
+ sdata0[i][j][k-1]
- 6 * sdata0[i][j][k];
y[D(c0, c1, c2+64)] = sdata64[i+1][j][k]
+ sdata64[i-1][j][k]
+ sdata64[i][j+1][k]
+ sdata64[i][j-1][k]
+ sdata64[i][j][k+1]
+ sdata64[i][j][k-1]
- 6 * sdata64[i][j][k];