分叉后CUDA初始化错误
调用 fork() 之后,我在子进程中得到“初始化错误”(initialization error)。如果不调用 fork() 而直接运行相同的程序,则一切正常。出问题的代码大致如下:
if (fork() == 0) {
...
cudaMalloc(....);
...
}
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <cuda_runtime.h>
/*
 * PERR(call): evaluate a CUDA runtime call exactly once and abort on failure.
 *
 * If `call` does not return cudaSuccess, prints the source location, the
 * error string for the returned status and the text of the call to stderr,
 * then terminates the process with exit status 1.
 *
 * - Wrapped in do { } while (0) so that `PERR(x);` is a single statement and
 *   is safe inside an unbraced if/else.
 * - The status returned by the call itself is reported, rather than
 *   re-querying cudaGetLastError().
 * - The stringified call is passed as a "%s" argument instead of being
 *   spliced into the format string, so a '%' in the call text cannot be
 *   misread as a conversion specifier.
 */
#define PERR(call) \
    do {\
        cudaError_t perr_status_ = (call);\
        if (perr_status_ != cudaSuccess) {\
            fprintf(stderr, "%s:%d Error [%s] on %s\n", __FILE__, __LINE__,\
                    cudaGetErrorString(perr_status_), #call);\
            exit(1);\
        }\
    } while (0)
/*
 * Minimal reproduction of the CUDA "initialization error" seen after fork().
 *
 * The parent makes a CUDA runtime call before forking; the child then makes
 * its own CUDA calls and the cudaMalloc() wrapped in PERR reports the error.
 * The statement order is the point of the example and is intentionally left
 * as-is.
 */
int
main(int argc, char **argv)
{
float *v_d;
int gpucount;
/* First CUDA runtime call, made in the parent BEFORE fork(); its return
   value is not checked. The accompanying text reports that commenting this
   call out makes the program work. */
cudaGetDeviceCount(&gpucount);
/* Only the child (fork() == 0) enters the branch; a failed fork() (-1) is
   not distinguished from the parent path. */
if (fork() == 0) {
/* Child: return value of cudaSetDevice() is not checked; the failure
   surfaces at the PERR-wrapped cudaMalloc() below. */
cudaSetDevice(0);
PERR(cudaMalloc(&v_d, 1000*sizeof(float)));
}
/* Reached by both processes: the child does not exit inside the branch, so
   it also calls wait() before returning. */
wait(NULL);
return 0;
}
这是什么原因造成的?
下面是一个完整的例子。如果我把对 cudaGetDeviceCount 的调用注释掉,程序就可以正常工作。
if (fork() == 0) {
...
cudaMalloc(....);
...
}
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <cuda_runtime.h>
/*
 * PERR(call): evaluate a CUDA runtime call exactly once and abort on failure.
 *
 * If `call` does not return cudaSuccess, prints the source location, the
 * error string for the returned status and the text of the call to stderr,
 * then terminates the process with exit status 1.
 *
 * - Wrapped in do { } while (0) so that `PERR(x);` is a single statement and
 *   is safe inside an unbraced if/else.
 * - The status returned by the call itself is reported, rather than
 *   re-querying cudaGetLastError().
 * - The stringified call is passed as a "%s" argument instead of being
 *   spliced into the format string, so a '%' in the call text cannot be
 *   misread as a conversion specifier.
 */
#define PERR(call) \
    do {\
        cudaError_t perr_status_ = (call);\
        if (perr_status_ != cudaSuccess) {\
            fprintf(stderr, "%s:%d Error [%s] on %s\n", __FILE__, __LINE__,\
                    cudaGetErrorString(perr_status_), #call);\
            exit(1);\
        }\
    } while (0)
/*
 * Minimal reproduction of the CUDA "initialization error" seen after fork().
 *
 * The parent makes a CUDA runtime call before forking; the child then makes
 * its own CUDA calls and the cudaMalloc() wrapped in PERR reports the error.
 * The statement order is the point of the example and is intentionally left
 * as-is.
 */
int
main(int argc, char **argv)
{
float *v_d;
int gpucount;
/* First CUDA runtime call, made in the parent BEFORE fork(); its return
   value is not checked. The accompanying text reports that commenting this
   call out makes the program work. */
cudaGetDeviceCount(&gpucount);
/* Only the child (fork() == 0) enters the branch; a failed fork() (-1) is
   not distinguished from the parent path. */
if (fork() == 0) {
/* Child: return value of cudaSetDevice() is not checked; the failure
   surfaces at the PERR-wrapped cudaMalloc() below. */
cudaSetDevice(0);
PERR(cudaMalloc(&v_d, 1000*sizeof(float)));
}
/* Reached by both processes: the child does not exit inside the branch, so
   it also calls wait() before returning. */
wait(NULL);
return 0;
}
在本例中,我只是想在父进程中获取可用设备的数量。下面这种变通方法可以做到这一点:
/* Work-around fragment: query the device count in a short-lived child so
   that the parent itself makes no CUDA call before its later fork(). */
if (fork() == 0) {
/* Child: on failure PERR prints the error and calls exit(1). */
PERR(cudaGetDeviceCount(&gpucount));
/* Hands the count back through the process exit status; this presumably
   sits directly in main() so that the return ends the child - TODO confirm. */
return(gpucount);
}
/* Parent: gpucount temporarily receives the raw wait status. */
wait(&gpucount);
/* WEXITSTATUS keeps only the low 8 bits of the exit status.
   NOTE(review): a child that failed via PERR's exit(1) is indistinguishable
   from "1 device"; WIFEXITED and the return value of wait() are not
   checked. */
gpucount = WEXITSTATUS(gpucount);
fork() 会创建一个子进程,而每个进程都有自己的地址空间。CUDA 上下文不能在两个不同的进程之间共享,原因有很多,其中之一是指针在另一个地址空间中没有意义。
如果在 fork() 之前创建了 CUDA 上下文,就不能在子进程中使用该上下文。子进程中的 cudaSetDevice(0) 调用试图共享父进程在调用 cudaGetDeviceCount() 时隐式创建的 CUDA 上下文。
正如您所暗示的,解决方案是在父进程或子进程中执行 CUDA 工作。如果您使用的是多设备系统,应该可以将不同的设备分配给不同的进程(CUDA 的 simpleIPC 示例代码正是这样做的)。(关键是不要在 fork() 之前创建 CUDA 上下文。)
您可能还会对相关的参考资料感兴趣(原文此处的两个链接在转载时已丢失)。
下面是一个完整的示例(需要2个CUDA设备),显示了使用单独GPU的子进程和父进程:
$ cat t345.cu
#include <unistd.h> /* Symbolic Constants */
#include <sys/types.h> /* Primitive System Data Types */
#include <errno.h> /* Errors */
#include <stdio.h> /* Input/Output */
#include <sys/wait.h> /* Wait for Process Termination */
#include <stdlib.h> /* General Utilities */
/*
 * cudaCheckErrors(msg): check the CUDA runtime's last-error state.
 *
 * Reads the status with cudaGetLastError(). If it is anything other than
 * cudaSuccess, prints `msg`, the CUDA error string and the source location
 * to stderr, then terminates the process with exit status 1. Intended to be
 * placed immediately after the CUDA call (or kernel launch + synchronize)
 * it is meant to verify.
 */
#define cudaCheckErrors(msg) \
    do { \
        const cudaError_t cce_status_ = cudaGetLastError(); \
        if (cce_status_ == cudaSuccess) \
            break; /* nothing to report; leaves only this do/while */ \
        fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(cce_status_), \
                __FILE__, __LINE__); \
        fprintf(stderr, "*** FAILED - ABORTING\n"); \
        exit(1); \
    } while (0)
/*
 * addkernel: adds 1 to the single int that `data` points to.
 *
 * Launch layout: the kernel does no thread indexing and no synchronization.
 * The launches in this example use <<<1,1>>>; with more than one thread the
 * unsynchronized read-modify-write on the same address would race.
 * Uses no shared memory.
 */
__global__ void addkernel(int *data)
{
    data[0] = data[0] + 1;
}
/*
 * Runs addkernel once on the given CUDA device and returns the result.
 *
 * Selects `device`, copies `value` to a freshly allocated device int,
 * launches addkernel<<<1,1>>> on it, copies the result back and frees the
 * device allocation. Every step is verified with cudaCheckErrors, which
 * terminates the process on failure.
 *
 * device        CUDA device ordinal to run on.
 * value         host-side input value.
 * setDeviceMsg  message reported if cudaSetDevice() fails, so that the
 *               caller (child/parent) can be identified in the error output.
 *
 * Returns value + 1 as computed on the device.
 */
static int incrementOnDevice(int device, int value, const char *setDeviceMsg)
{
    int *d_value = NULL;

    cudaSetDevice(device);
    cudaCheckErrors(setDeviceMsg);
    cudaMalloc(&d_value, sizeof(int));
    cudaCheckErrors("cudaMalloc fail");
    cudaMemcpy(d_value, &value, sizeof(int), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D fail");
    addkernel<<<1,1>>>(d_value);
    /* The synchronize makes both launch and execution errors visible to the
       cudaCheckErrors() that follows. */
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    cudaMemcpy(&value, d_value, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy D2H fail");
    cudaFree(d_value);
    cudaCheckErrors("cudaFree fail");
    return value;
}

/*
 * Demonstrates a parent and a child process each using CUDA after fork().
 *
 * No CUDA runtime call is made before fork(), so each process makes its
 * first CUDA call on its own. The child works on device 0 and exits with a
 * user-supplied code; the parent works on device 1, waits for the child,
 * then also uses device 0. Requires two CUDA devices.
 *
 * Exit status: the user-entered value in the child; 0 in the parent on
 * success; EXIT_FAILURE if fork() or wait() fails; 1 on any CUDA error
 * (via cudaCheckErrors).
 */
int main()
{
    pid_t childpid;   /* fork() result: 0 in the child, child's pid in the parent */
    int retval = 0;   /* child: user-provided exit code (0 if none could be read) */
    int status = 0;   /* parent: raw wait status of the child */

    /* Create the new process before touching the CUDA runtime. */
    childpid = fork();
    if (childpid < 0) {
        /* fork() failed: report it and signal failure (not success) to the caller. */
        perror("fork");
        exit(EXIT_FAILURE);
    }

    if (childpid == 0) {
        /* ---- child process ---- */
        printf("CHILD: I am the child process!\n");
        printf("CHILD: Here's my PID: %d\n", (int)getpid());
        printf("CHILD: My parent's PID is: %d\n", (int)getppid());
        printf("CHILD: The value of my copy of childpid is: %d\n", (int)childpid);

        printf("CHILD: result: %d\n",
               incrementOnDevice(0, 1, "CHILD cudaSetDevice fail"));

        printf("CHILD: Sleeping for 1 second...\n");
        sleep(1);
        /* Tear down the child's CUDA state on device 0 before it blocks on input. */
        cudaDeviceReset();

        printf("CHILD: Enter an exit value (0 to 255): ");
        fflush(stdout);   /* the prompt has no newline; make sure it is shown */
        if (scanf(" %d", &retval) != 1)
            retval = 0;   /* no usable input: exit with a defined status */
        printf("CHILD: Goodbye!\n");
        exit(retval);     /* only the low 8 bits are visible to the parent */
    }

    /* ---- parent process ---- */
    printf("PARENT: I am the parent process!\n");
    printf("PARENT: Here's my PID: %d\n", (int)getpid());
    printf("PARENT: The value of my copy of childpid is %d\n", (int)childpid);
    printf("PARENT: I will now wait for my child to exit.\n");

    /* Work on device 1 while the child is using device 0. */
    printf("PARENT: result: %d\n",
           incrementOnDevice(1, 2, "PARENT cudaSetDevice fail"));

    if (wait(&status) == -1) {
        perror("wait");
        exit(EXIT_FAILURE);
    }
    if (WIFEXITED(status))
        printf("PARENT: Child's exit code is: %d\n", WEXITSTATUS(status));
    else
        fprintf(stderr, "PARENT: Child did not exit normally\n");

    /* After the child has finished, the parent uses device 0 as well. */
    printf("PARENT: result2: %d\n",
           incrementOnDevice(0, 5, "PARENT cudaSetDevice 2 fail"));

    printf("PARENT: Goodbye!\n");
    exit(0);
}
$ nvcc -arch=sm_20 -o t345 t345.cu
$ ./t345
CHILD: I am the child process!
CHILD: Here's my PID: 23603
CHILD: My parent's PID is: 23602
CHILD: The value of my copy of childpid is: 0
PARENT: I am the parent process!
PARENT: Here's my PID: 23602
PARENT: The value of my copy of childpid is 23603
PARENT: I will now wait for my child to exit.
CHILD: result: 2
CHILD: Sleeping for 1 second...
PARENT: result: 3
CHILD: Enter an exit value (0 to 255): 10
CHILD: Goodbye!
PARENT: Child's exit code is: 10
PARENT: result2: 6
PARENT: Goodbye!
$
$cat t345.cu
#包含/*符号常量*/
#包含/*基本系统数据类型*/
#包括/*错误*/
#包括/*输入/输出*/
#包括/*等待进程终止*/
#包括/*一般公用设施*/
#定义cudaCheckErrors(msg)\
做{\
cudaError\u t\u err=cudaGetLastError()\
如果(_err!=cudaSuccess){\
fprintf(标准,“致命错误:%s(%s位于%s:%d)\n”\
msg,cudaGetErrorString(_err)\
__文件(行)\
fprintf(stderr,“***失败-中止\n”)\
出口(1)\
} \
}而(0)
__全局无效添加内核(int*数据){
*数据+=1;
}
int main()
{
pid\u t childpid;/*变量来存储子pid*/
int retval;/*子进程:用户提供的返回代码*/
int status;/*父进程:子进程的退出状态*/
/*只需要1个int变量,因为每个进程都有自己的
变量的自身实例
为了清晰起见,这里使用了两个int变量*/
/*现在创建新流程*/
childpid=fork();
如果(childpid>=0)/*分叉成功*/
{
if(childpid==0)/*fork()将0返回给子进程*/
{
printf(“孩子:我是孩子进程!\n”);
printf(“孩子:这是我的PID:%d\n”,getpid());
printf(“孩子:我父母的PID是:%d\n”,getppid());
printf(“CHILD:childpid副本的值为:%d\n”,childpid);
int*h_a,*d_a;
h_a=(int*)malloc(sizeof(int));
cudaSetDevice(0);
cudaCheckErrors(“子CUDASETDEVIEL”);
库达马洛克(d_a,sizeof(int)),;
cudaCheckErrors(“Cudamaloc失败”);
*h_a=1;
cudaMemcpy(d_a,h_a,sizeof(int),cudamemcpyhostodevice);
CUDACHECKERS(“cudaMemcpy H2D故障”);
addkernel(d_a);
cudaDeviceSynchronize();
cudaCheckErrors(“内核失败”);
cudaMemcpy(h_a,d_a,sizeof(int),cudaMemcpyDeviceToHost);
CUDACHECKERS(“cudaMemcpy D2H故障”);
printf(“子:结果:%d\n”,*h\u a);
printf(“孩子:睡1秒钟…\n”);
睡眠(1);/*睡眠1秒*/
cudaDeviceReset();
printf(“子项:输入一个退出值(0到255):”;
scanf(“%d”、&retval);
printf(“孩子:再见!\n”);
退出(retval);/*子级使用用户提供的返回代码退出*/
}
else/*fork()将新pid返回给父进程*/
{
printf(“父进程:我是父进程!\n”);
printf(“父:这是我的PID:%d\n”,getpid());
printf(“父项:我的childpid副本的值为%d\n”,childpid);
printf(“家长:我现在将等待我的孩子退出。\n”);
int*h_a,*d_a;
h_a=(int*)malloc(sizeof(int));
cudaSetDevice(1);
cudaCheckErrors(“父CUDASETDEVIEL”);
库达马洛克(d_a,sizeof(int)),;
cudaCheckErrors(“Cudamaloc失败”);
*h_a=2;
cudaMemcpy(d_a,h_a,sizeof(int),cudamemcpyhostodevice);
CUDACHECKERS(“cudaMemcpy H2D故障”);
addkernel(d_a);
cudaDeviceSynchronize();
cudaCheckErrors(“内核失败”);
cudaMemcpy(h_a,d_a,sizeof(int),cudaMemcpyDeviceToHost);
CUDACHECKERS(“cudaMemcpy D2H故障”);
printf(“父:结果:%d\n”,*h\u a);
等待(&status);/*等待子级退出,并存储其状态*/
printf(“父项:子项的退出代码为:%d\n”,WEXITSTATUS(status));
cudaSetDevice(0);
cudaCheckErrors(“父cudaSetDevice 2失败”);
int*h_a2,*d_a2;
Cudamaloc(和d_a2,sizeof(int));
cudaCheckErrors(“Cudamaloc失败”);
h_a2=(int*)malloc(sizeof(int));
*h_a2=5;
cudaMemcpy(d_a2,h_a2,sizeof(int),cudamemcpyhostodevice);
CUDACHECKERS(“cudaMemcpy H2D故障”);
addkernel(d_a2);
cudaDeviceSynchronize();
cudaCheckErrors(“内核失败”);
cudaMemcpy(h_a2,d_a2,sizeof(int),cudamemcpydevicetoost);
CUDACHECKERS(“cudaMemcpy D2H故障”);
printf(“父项:结果2:%d\n”,*h_a2);
printf(“家长:再见!\n”);
退出(0);/*父级退出*/
}
}
其他的