CUDA无法再将数据从设备复制到主机；“坏的”；对函数的调用_C_Cuda

CUDA无法再将数据从设备复制到主机；“坏的”；对函数的调用

c cuda

CUDA无法再将数据从设备复制到主机；“坏的”；对函数的调用,c,cuda,C,Cuda,我正在测试一段代码，其中内核要在存储在两个指针中的两个值之间执行简单的求和在调用内核“add”之后，我不能再将指针的数据从主机复制到设备，也不能再从主机复制到主机，即使没有对内核中的指针执行任何操作。但是当我对调用函数的语句进行注释时，我得到了正确的结果。代码如下： #include <stdio.h> #include <stdlib.h> #include <cuda_runtime.h> __global__ void add(int *a, int

我正在测试一段代码，其中内核要在存储在两个指针中的两个值之间执行简单的求和

在调用内核“add”之后，我无法再将数据从主机复制到设备，也无法再从设备复制到主机，即使内核中没有对这些指针执行任何操作。但是当我把调用该内核的语句注释掉时，就能得到正确的结果。代码如下：

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Kernel: stores the difference *a - *b into *c. Note that, despite the
// name "add", the operation performed is a subtraction.
// All three pointers are dereferenced in device code, so they are expected
// to refer to device memory; no thread indexing is used (single-value kernel).
__global__ void add(int *a, int *b, int *c)
{
*c = *a - *b;
}

// Reproduction case from the question: launches `add` with HOST pointers on
// purpose and then attempts ordinary host<->device copies to observe the
// effect. No CUDA return code is checked except the final cudaGetLastError().
int main(void)
{
int result, x_val, y_val; // Host variables that receive data copied back from the device.
int *x_host, *y_host; // Pointers to host memory (allocated with malloc below).
int *tempGPU, *x_dev, *y_dev; // Pointers to device memory (allocated with cudaMalloc below).

x_host = (int *)malloc(sizeof(int));
y_host = (int *)malloc(sizeof(int));

*x_host = 8;
*y_host = 4;

// Presumably sentinel values, so that a copy-back which never happens is
// visible in the final printout.
x_val = -5;
y_val = -10;

printf("\n x = %d, y = %d\n", *x_host, *y_host);

cudaMalloc( (void **)&tempGPU, sizeof(int) );

// Deliberately wrong: x_host and y_host are malloc'd host pointers, but the
// kernel dereferences them on the device. This statement is the source of
// the problem discussed in the question.
add<<<1,1>>> (x_host, y_host, tempGPU);

// Return value is ignored, so a failure caused by the launch above goes
// unnoticed here; `result` may be left uninitialized.
cudaMemcpy(&result, tempGPU, sizeof(int), cudaMemcpyDeviceToHost);

printf("\n x_host - y_host = %d\n", result);

cudaMalloc( (void **)&x_dev, sizeof(int) );
cudaMalloc( (void **)&y_dev, sizeof(int) );

*x_host = 6;
*y_host = 20;

// Round trip: host -> device ...
cudaMemcpy(x_dev, x_host, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(y_dev, y_host, sizeof(int), cudaMemcpyHostToDevice);

// ... and device -> host into x_val / y_val.
cudaMemcpy(&x_val, x_dev, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&y_val, y_dev, sizeof(int), cudaMemcpyDeviceToHost);

printf("\n x_host = %d, y_host = %d\n", *x_host, *y_host);
printf("\n x_val = %d, y_val = %d\n", x_val, y_val);

// Only tempGPU is released; x_dev, y_dev, x_host and y_host are not freed.
cudaFree( tempGPU );

// The only error query in the program: prints the last recorded CUDA error.
printf( "\nCUDA: %s\n", cudaGetErrorString(cudaGetLastError()) );

return 0;

}

#包括
#包括
#包括
__全局无效添加（int*a、int*b、int*c）
{
*c=*a-*b；
}
内部主（空）
{
int result，x_val，y_val；//在此变量中存储从设备到主机的数据。
int*x_host，*y_host；//主机中的指针
int*tempGPU，*x_-dev，*y_-dev；//设备中的指针
x_host=（int*）malloc（sizeof（int））；
y_host=（int*）malloc（sizeof（int））；
*x_主机=8；
*y_主机=4；
x_val=-5；
y_val=-10；
printf（“\n x=%d，y=%d\n”，*x\u主机，*y\u主机）；
cudamaloc（（void**）和tempGPU，sizeof（int））；
//将此参数传递给函数是错误的。问题在于此语句。
添加（x_主机、y_主机、tempGPU）；
cudaMemcpy（&result，tempGPU，sizeof（int），cudaMemcpyDeviceToHost）；
printf（“\n x\u主机-y\u主机=%d\n”，结果）；
Cudamaloc（（void**）和x_dev，sizeof（int））；
Cudamaloc（（void**）和y_dev，sizeof（int））；
*x_主机=6；
*y_主机=20；
cudaMemcpy（x_dev，x_host，sizeof（int），cudamemcpyhostodevice）；
cudaMemcpy（y_dev，y_host，sizeof（int），cudamemcpyhostodevice）；
cudaMemcpy（&x_val，x_dev，sizeof（int），cudaMemcpyDeviceToHost）；
cudaMemcpy（&y_val，y_dev，sizeof（int），cudaMemcpyDeviceToHost）；
printf（“\n x\u主机=%d，y\u主机=%d\n”，*x\u主机，*y\u主机）；
printf（“\n x_val=%d，y_val=%d\n”，x_val，y_val）；
cudaFree（tempGPU）；
printf（“\nCUDA:%s\n”，cudaGetErrorString（cudaGetLastError（））；
返回0；
}

我知道该内核函数需要在设备上分配的指针，但为什么这样的错误会导致之后无法正确使用cudaMemcpy？为什么当我注释掉下面这条语句时：

add<<<1,1>>> (x_host, y_host, tempGPU);

add（x_主机、y_主机、tempGPU）；

我得到了正确的结果。谢谢。

您的问题是 `x_host` 和 `y_host` 是指向主机内存空间的指针，而 `__global__ add` 函数需要指向设备内存空间的指针。按您的写法，`add` 会错误地把 `x_host` 和 `y_host` 当作设备内存指针来解释。

正如Farzad所注意到的，您可以通过正确的CUDA错误检查自己发现错误

下面是通过正确的CUDA错误检查修复的代码

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// gpuErrchk(call): evaluates a CUDA runtime call and reports any failure
// together with the source file and line of the call site.
// Wrapped in do { } while (0) so the macro behaves as a single statement,
// e.g. inside an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a CUDA error to stderr and optionally terminates the process.
//   code  - status returned by a CUDA runtime call
//   file  - source file of the call site (typically __FILE__, hence const)
//   line  - source line of the call site (typically __LINE__)
//   abort - when true (default), exit with `code` as the process exit status
// Does nothing when code == cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        // exit() does not return, so nothing may follow it in this branch.
        if (abort) { exit(code); }
    }
}

// Kernel: reads one int through each of a and b and writes their difference
// (*a - *b) through c. Despite its name it subtracts. Uses no thread or
// block indices; the caller in this file launches it as <<<1,1>>>.
__global__ void add(int *a, int *b, int *c)
{
    const int minuend = *a;
    const int subtrahend = *b;
    *c = minuend - subtrahend;
}

// Host driver: uploads two ints to the device, launches add<<<1,1>>> on the
// device copies, reads the single int result back and prints it.
// Every CUDA runtime call goes through gpuErrchk, which prints the error and
// exits the process on failure. Returns 0 on success, EXIT_FAILURE if the
// host buffers cannot be allocated.
int main(void)
{
    int* x_host = (int*)malloc(sizeof(int));
    int* y_host = (int*)malloc(sizeof(int));

    // malloc may return NULL; do not dereference unchecked pointers.
    if (x_host == NULL || y_host == NULL)
    {
        fprintf(stderr, "Host allocation failed\n");
        free(x_host);   // free(NULL) is a no-op
        free(y_host);
        return EXIT_FAILURE;
    }

    *x_host = 8;
    *y_host = 4;

    int* tempGPU;   gpuErrchk(cudaMalloc((void**)&tempGPU,sizeof(int)));
    int* x_dev;     gpuErrchk(cudaMalloc((void**)&x_dev,  sizeof(int)));
    int* y_dev;     gpuErrchk(cudaMalloc((void**)&y_dev,  sizeof(int)));

    // The kernel must receive device pointers, so copy the inputs over first.
    gpuErrchk(cudaMemcpy(x_dev, x_host, sizeof(int), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(y_dev, y_host, sizeof(int), cudaMemcpyHostToDevice));

    int result = 0;

    add<<<1,1>>> (x_dev, y_dev, tempGPU);
    gpuErrchk(cudaPeekAtLastError());      // errors reported at launch time
    gpuErrchk(cudaDeviceSynchronize());    // errors raised while the kernel ran

    gpuErrchk(cudaMemcpy(&result, tempGPU, sizeof(int), cudaMemcpyDeviceToHost));

    printf("\n x_host - y_host = %d\n", result);

    gpuErrchk(cudaFree(x_dev));
    gpuErrchk(cudaFree(y_dev));
    gpuErrchk(cudaFree(tempGPU));

    // Release the host buffers as well.
    free(x_host);
    free(y_host);

    // Waits for a character on stdin before returning.
    getchar();

    return 0;
}

#包括
#包括
#包括
#定义gpuerchk（ans）{gpuAssert（（ans），_文件_，_行__）}
内联void gpuAssert（cudaError\u t代码，char*文件，int行，bool abort=true）
{
如果（代码！=cudaSuccess）
{
fprintf（标准，“GPUassert:%s%s%d\n”，cudaGetErrorString（代码）、文件、行）；
if（abort）{exit（code）；getchar（）；}
}
}
__全局无效添加（int*a、int*b、int*c）
{
*c=*a-*b；
}
内部主（空）
{
int*x_host=（int*）malloc（sizeof（int））；
int*y_host=（int*）malloc（sizeof（int））；
*x_主机=8；
*y_主机=4；
int*tempGPU；gpuerchk（cudaMalloc（（void**）和tempGPU，sizeof（int））；
int*x_dev；gpuerchk（cudamaloc（（void**）和x_dev，sizeof（int））；
int*y_dev；gpuerchk（cudamaloc（（void**）和y_dev，sizeof（int））；
gpuerchk（cudaMemcpy（x_-dev，x_-host，sizeof（int），cudaMemcpyHostToDevice））；
gpuerchk（cudaMemcpy（y_dev，y_host，sizeof（int），cudaMemcpyHostToDevice））；
int结果；
添加（x_-dev、y_-dev、tempGPU）；
gpuerchk（cudaPeekAtLastError（））；
gpuErrchk（cudaDeviceSynchronize（））；
gpuerchk（cudaMemcpy（&result，tempGPU，sizeof（int），cudaMemcpyDeviceToHost））；
printf（“\n x\u主机-y\u主机=%d\n”，结果）；
gpuErrchk（cudaFree（x_dev））；
gpuErrchk（cudaFree（y_dev））；
gpuErrchk（cudaFree（tempGPU））；
getchar（）；
返回0；
}

您的问题是 `x_host` 和 `y_host` 是指向主机内存空间的指针，而 `__global__ add` 函数需要指向设备内存空间的指针。按您的写法，`add` 会错误地把 `x_host` 和 `y_host` 当作设备内存指针来解释。

您没有检查错误，这种不正确的推理正源于此。您的 `add` 内核无法正常运行，但由于直到 `cudaMemcpy` 才捕捉到错误，程序看起来仍在继续执行。请看一看。

是的，我是特意用指向主机内存空间的指针来调用 `add` 的，就是想看看会发生什么。那么，如果不检查CUDA错误，CUDA运行时API中的一个错误就会使后续的CUDA函数调用被“禁用”或“损坏”吗？这就是为什么在我的代码示例中 `cudaMemcpy` 给出了错误的结果？

@leong 你的代码在调用 `add` 函数时使用了错误的参数，结果卡住了，之后在我的系统上 `cudaMemcpy` 没有被执行。