Cuda 如何在gpu中创建矩阵并在cpu上打印?
这是一段在 GPU 上创建矩阵、并在 CPU 上打印出来的代码(完整代码见下文)。谁能告诉我哪里出了问题?多谢各位。标签:cuda, dynamic-memory-allocation
# include <stdio.h>
// Kernel: every thread stores the value 1 into one cell of a matrix that is
// laid out as a table of row pointers.
//   d_a - table of row pointers; the cell written is d_a[threadIdx.x][threadIdx.y].
// No bounds check is performed here, so the block dimensions chosen at launch
// must not exceed the extents of the table and of each row.
__global__ void create(int **d_a){
    const int row = threadIdx.x;
    const int col = threadIdx.y;
    // Fetch the row pointer first, then write the element inside that row.
    int *row_ptr = d_a[row];
    row_ptr[col] = 1;
}
/*
 * Terminates the program with exit(-1) if the CUDA runtime reports an error.
 *
 * Two sources are checked:
 *   1. cudaGetLastError()      - the last error recorded by a runtime call or
 *                                kernel launch (reading it also clears it);
 *   2. cudaDeviceSynchronize() - kernel launches are asynchronous, so a fault
 *                                raised while a kernel runs (e.g. an illegal
 *                                address) is only reported once the host
 *                                waits for the device.
 * The diagnostic is written to stderr so it is not mixed into program output.
 */
void errorCheck(){
    cudaError_t error = cudaGetLastError();
    if(error == cudaSuccess){
        // Nothing pending from the launch/API side: wait for outstanding
        // device work so that execution-time failures surface here as well.
        error = cudaDeviceSynchronize();
    }
    if(error != cudaSuccess){
        // print the CUDA error message and exit
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(error));
        exit(-1);
    }
}
# define N 5
/*
 * Builds an N x N int matrix on the device as a table of N row pointers,
 * fills it with ones using the `create` kernel, copies it back to the host
 * one row at a time and prints it.
 *
 * Returns 0 on success and 1 if a host allocation fails; CUDA failures are
 * handled by errorCheck(), which terminates the program.
 */
int main(){
    // Host matrix: a table of N pointers, each to its own row of N ints.
    int **a = (int**)malloc(N * sizeof(int*));
    if (a == NULL){
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    for (int i = 0; i < N; i++){
        a[i] = (int*)malloc(N * sizeof(int));
        if (a[i] == NULL){
            fprintf(stderr, "host allocation failed\n");
            for (int k = 0; k < i; k++){
                free(a[k]);
            }
            free(a);
            return 1;
        }
    }

    // Device addresses of the rows, kept on the host so the rows can later be
    // copied back and freed without dereferencing device memory on the host.
    int *h_rows[N];
    for (int i = 0; i < N; i++){
        h_rows[i] = NULL;
    }

    // Device-side table of row pointers (this is what the kernel indexes).
    int **d_a = NULL;
    cudaMalloc((void**)&d_a, N * sizeof(int*));
    errorCheck();

    // Each row gets its own allocation; the address goes into h_rows[i],
    // never into d_a itself (that would discard the table allocated above).
    for (int i = 0; i < N; i++){
        cudaMalloc((void**)&h_rows[i], N * sizeof(int));
        errorCheck();
    }

    // Publish all row addresses into the device table with a single copy.
    cudaMemcpy(d_a, h_rows, N * sizeof(int*), cudaMemcpyHostToDevice);
    errorCheck();

    // The kernel indexes with threadIdx.x and threadIdx.y, so the block must
    // be two-dimensional: N x N threads, one per matrix cell.
    create <<<1, dim3(N, N)>>>(d_a);
    errorCheck();
    // Wait for the kernel so that any fault raised while it runs is reported
    // here rather than by a later, unrelated call.
    cudaDeviceSynchronize();
    errorCheck();

    // The rows are separate allocations on both sides: copy them one by one.
    for (int i = 0; i < N; i++){
        cudaMemcpy(a[i], h_rows[i], N * sizeof(int), cudaMemcpyDeviceToHost);
        errorCheck();
    }

    for (int i = 0; i < N; i++){
        for (int j = 0; j < N; j++){
            printf("%d", a[i][j]);
        }
        printf("\n");
    }

    // Release every row before the table that referenced it, on both sides.
    for (int i = 0; i < N; i++){
        cudaFree(h_rows[i]);
        free(a[i]);
    }
    cudaFree(d_a);
    free(a);
    return 0;
}
#包括
__全局无效创建(int**d\u a){
int i=threadIdx.x;
int j=螺纹内径x.y;
d_a[i][j]=1;
}
无效错误检查(){
cudaError_t error=cudaGetLastError();
如果(错误!=cudaSuccess){
//打印CUDA错误消息并退出
printf(“CUDA错误:%s\n”,cudaGetErrorString(错误));
出口(-1);
}
}
#定义n5
int main(){
国际**d_a,**a;
a=(int**)malloc(N*sizeof(int*);
对于(int i=0;i
是内存分配有问题,还是 memcpy 有问题?
两方面都有问题。
这段代码:
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i =0; i < N; i++){
cudaMalloc((void**)&d_a,N*sizeof(int));
}
必须这样做:
// Allocate the device-side table that holds N row pointers.
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i=0; i < N; i++){
// Host variable that receives the device address of row i.
int *row;
// Allocate one row of N ints on the device.
cudaMalloc((void**)&row, N*sizeof(int));
// Store that row address into slot i of the device table (host -> device).
cudaMemcpy(d_a+i, &row, sizeof(int*), cudaMemcpyHostToDevice);
}
// Launch one block of N x N threads so both threadIdx.x and threadIdx.y
// range over 0..N-1.
create <<<1, dim3(N,N)>>>(d_a);
errorCheck();
for(int i=0; i<N; i++) {
int* row;
// Read slot i of the device table back to the host to learn where row i lives.
cudaMemcpy(&row, d_a+i, sizeof(int*), cudaMemcpyDeviceToHost);
// Copy the N ints of that device row into host row a[i].
cudaMemcpy(a[i], row, sizeof(int) * N, cudaMemcpyDeviceToHost);
}
cudamaloc((void***)和d_a,N*sizeof(int*);
对于(int i=0;i
然后这个:
// Launches create with one block of N threads in the x dimension only, so
// threadIdx.y is 0 for every thread.
create <<<1, N>>>(d_a);
errorCheck();
// A single transfer of N*N ints from d_a into a.
cudaMemcpy(a, d_a, (N*N)*sizeof(int),cudaMemcpyDeviceToHost);
创建(d_a);
错误检查();
cudaMemcpy(a,d_a,(N*N)*sizeof(int),cudaMemcpyDeviceToHost);
必须这样做:
// Allocate the device-side table that holds N row pointers.
cudaMalloc((void***)&d_a, N*sizeof(int*));
for (int i=0; i < N; i++){
// Host variable that receives the device address of row i.
int *row;
// Allocate one row of N ints on the device.
cudaMalloc((void**)&row, N*sizeof(int));
// Store that row address into slot i of the device table (host -> device).
cudaMemcpy(d_a+i, &row, sizeof(int*), cudaMemcpyHostToDevice);
}
// Launch one block of N x N threads so both threadIdx.x and threadIdx.y
// range over 0..N-1.
create <<<1, dim3(N,N)>>>(d_a);
errorCheck();
for(int i=0; i<N; i++) {
int* row;
// Read slot i of the device table back to the host to learn where row i lives.
cudaMemcpy(&row, d_a+i, sizeof(int*), cudaMemcpyDeviceToHost);
// Copy the N ints of that device row into host row a[i].
cudaMemcpy(a[i], row, sizeof(int) * N, cudaMemcpyDeviceToHost);
}
创建(d_a);
错误检查();
对于(int i=0;i