分叉后CUDA初始化错误_Cuda - Fatal编程技术网

分叉后CUDA初始化错误

cuda

分叉后CUDA初始化错误,cuda,Cuda,调用fork（）后，我得到“初始化错误”。如果我在没有fork的情况下运行相同的程序，那么一切都正常 if (fork() == 0) { ... cudaMalloc(....); ... } #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <sys/types.h> #include <sys/wait.h> #incl

调用fork（）后，我得到“初始化错误”。如果我在没有fork的情况下运行相同的程序，那么一切都正常

if (fork() == 0) {
    ...
    cudaMalloc(....);
    ...
}

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <cuda_runtime.h>

/*
 * PERR(call): evaluates `call` exactly once and treats any result other than
 * cudaSuccess as fatal: prints file, line, the CUDA error string and the text
 * of the failing call to stderr, then exits with status 1.
 *
 * The status returned by the call itself is reported (rather than re-reading
 * it through cudaGetLastError()), and the body is wrapped in do { } while (0)
 * so that `PERR(x);` behaves as a single statement, e.g. inside if/else.
 * The call text is passed as a "%s" argument so that a '%' inside it cannot
 * be misread as a format directive.
 */
#define PERR(call) \
  do {\
    cudaError_t perr_status_ = (call);\
    if (perr_status_ != cudaSuccess) {\
      fprintf(stderr, "%s:%d Error [%s] on %s\n", __FILE__, __LINE__,\
          cudaGetErrorString(perr_status_), #call);\
      exit(1);\
    }\
  } while (0)

/*
 * Minimal reproducer for the reported problem: a CUDA runtime call is made in
 * the parent, the process forks, and the child then tries to allocate device
 * memory. As reported with this example, the child's cudaMalloc fails with
 * "initialization error" unless the cudaGetDeviceCount call is commented out.
 */
int
main(int argc, char **argv)
{
  float *v_d;
  int gpucount;

  /* CUDA runtime call issued in the parent BEFORE fork(); its return value is
     not checked. NOTE(review): per the accompanying answer, this call
     implicitly creates a CUDA context in the parent. */
  cudaGetDeviceCount(&gpucount);

  if (fork() == 0) {
    /* Child process: select device 0 (return value not checked), then try to
       allocate 1000 floats of device memory. PERR prints the error and exits
       with status 1 if the allocation fails. */
    cudaSetDevice(0);
    PERR(cudaMalloc(&v_d, 1000*sizeof(float)));
  }
  /* Reached by the parent and also by a child whose allocation succeeded,
     because the child does not exit inside the if-block above. */
  wait(NULL);
  return 0;
}

这是什么原因造成的？

下面是一个完整的例子。如果我把cudaGetDeviceCount调用注释掉，程序就可以正常工作。

if (fork() == 0) {
    ...
    cudaMalloc(....);
    ...
}

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <cuda_runtime.h>

/*
 * PERR(call): evaluates `call` exactly once and treats any result other than
 * cudaSuccess as fatal: prints file, line, the CUDA error string and the text
 * of the failing call to stderr, then exits with status 1.
 *
 * The status returned by the call itself is reported (rather than re-reading
 * it through cudaGetLastError()), and the body is wrapped in do { } while (0)
 * so that `PERR(x);` behaves as a single statement, e.g. inside if/else.
 * The call text is passed as a "%s" argument so that a '%' inside it cannot
 * be misread as a format directive.
 */
#define PERR(call) \
  do {\
    cudaError_t perr_status_ = (call);\
    if (perr_status_ != cudaSuccess) {\
      fprintf(stderr, "%s:%d Error [%s] on %s\n", __FILE__, __LINE__,\
          cudaGetErrorString(perr_status_), #call);\
      exit(1);\
    }\
  } while (0)

/*
 * Minimal reproducer for the reported problem: a CUDA runtime call is made in
 * the parent, the process forks, and the child then tries to allocate device
 * memory. As reported with this example, the child's cudaMalloc fails with
 * "initialization error" unless the cudaGetDeviceCount call is commented out.
 */
int
main(int argc, char **argv)
{
  float *v_d;
  int gpucount;

  /* CUDA runtime call issued in the parent BEFORE fork(); its return value is
     not checked. NOTE(review): per the accompanying answer, this call
     implicitly creates a CUDA context in the parent. */
  cudaGetDeviceCount(&gpucount);

  if (fork() == 0) {
    /* Child process: select device 0 (return value not checked), then try to
       allocate 1000 floats of device memory. PERR prints the error and exits
       with status 1 if the allocation fails. */
    cudaSetDevice(0);
    PERR(cudaMalloc(&v_d, 1000*sizeof(float)));
  }
  /* Reached by the parent and also by a child whose allocation succeeded,
     because the child does not exit inside the if-block above. */
  wait(NULL);
  return 0;
}

在本例中，我只是想在父进程中获取可用设备的数量。下面这个变通方法可以做到这一点：

  /* Workaround fragment: the device-count query is issued from a forked
     child instead of from the parent. */
  if (fork() == 0) {
    /* Child: query the device count; PERR prints the error and exits with
       status 1 if the call fails. */
    PERR(cudaGetDeviceCount(&gpucount));
    /* Hand the count back as the return value. NOTE(review): presumably this
       returns from main, so the count becomes the child's exit status;
       confirm against the enclosing function. */
    return(gpucount);
  }
  /* Parent: gpucount is reused to receive the raw wait status of the child;
     the return value of wait() is not checked. */
  wait(&gpucount);
  /* Replace the raw status with the exit status extracted from it. */
  gpucount =  WEXITSTATUS(gpucount);

fork（）会创建一个子进程。子进程拥有自己的地址空间。CUDA上下文不能在两个不同的进程之间共享，原因有很多，其中之一是：同一个指针在不同的地址空间中没有意义。

如果在fork（）之前创建了CUDA上下文，则不能在子进程中使用该上下文。

子进程中的cudaSetDevice（0）调用试图共享CUDA上下文，而该上下文是父进程在调用cudaGetDeviceCount（）时隐式创建的。
正如您所暗示的，解决方案是只在父进程或只在子进程中执行CUDA工作。如果您使用的是多设备系统，应该可以将不同的设备分配给不同的进程（CUDA的simpleIPC示例代码正是这样做的）。（关键是不要在fork之前创建CUDA上下文。）
您可能还会对另外两个相关的问答感兴趣（原文此处的链接在转载时丢失）。
下面是一个完整的示例（需要2个CUDA设备），显示了使用单独GPU的子进程和父进程：
$ cat t345.cu
#include <unistd.h>     /* Symbolic Constants */
#include <sys/types.h>  /* Primitive System Data Types */
#include <errno.h>      /* Errors */
#include <stdio.h>      /* Input/Output */
#include <sys/wait.h>   /* Wait for Process Termination */
#include <stdlib.h>     /* General Utilities */


/*
 * cudaCheckErrors(msg): reads (and clears) the CUDA runtime's last error via
 * cudaGetLastError(). If it is not cudaSuccess, prints `msg`, the CUDA error
 * string and the file/line of the check to stderr and exits with status 1.
 *
 * Place it directly after the runtime call or kernel launch it is meant to
 * check. The do { } while (0) wrapper makes the macro behave as a single
 * statement. The local uses a trailing-underscore name because identifiers
 * containing a double underscore are reserved for the implementation.
 */
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cuda_check_err_ = cudaGetLastError(); \
        if (cuda_check_err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                (msg), cudaGetErrorString(cuda_check_err_), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


/*
 * Adds one to the single int that `data` points to.
 *
 * The update is a plain read-modify-write with no indexing by thread and no
 * atomics; every launch in this file uses <<<1,1>>>, i.e. a single thread.
 */
__global__ void addkernel(int *data){
  data[0] = data[0] + 1;
}

/*
 * Runs one "copy in, increment on the GPU, copy out" round-trip on `device`.
 *
 *   device        - CUDA device ordinal to select with cudaSetDevice
 *   seed          - value copied to the device before addkernel runs
 *   setDeviceMsg  - message reported if selecting the device fails
 *
 * Returns the value read back from the device after addkernel has run.
 * Any CUDA error is fatal (cudaCheckErrors prints it and exits with 1).
 * Host and device buffers are released before returning.
 *
 * Must only be called after fork(), so that each process issues its first
 * CUDA runtime call on its own.
 */
static int runAddOnDevice(int device, int seed, const char *setDeviceMsg)
{
    int *h_a, *d_a;
    int result;

    h_a = (int *)malloc(sizeof(int));
    if (h_a == NULL)
    {
        perror("malloc");
        exit(1);
    }

    cudaSetDevice(device);
    cudaCheckErrors(setDeviceMsg);
    cudaMalloc(&d_a, sizeof(int));
    cudaCheckErrors("cudaMalloc fail");
    *h_a = seed;
    cudaMemcpy(d_a, h_a, sizeof(int), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy H2D fail");
    addkernel<<<1,1>>>(d_a);
    /* the launch is asynchronous; synchronize so the check below also sees
       errors raised while the kernel was running */
    cudaDeviceSynchronize();
    cudaCheckErrors("kernel fail");
    cudaMemcpy(h_a, d_a, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy D2H fail");
    result = *h_a;

    cudaFree(d_a);
    cudaCheckErrors("cudaFree fail");
    free(h_a);
    return result;
}

/*
 * Demonstrates parent and child processes each using CUDA after fork().
 *
 * No CUDA runtime call is made before fork(). The child works on device 0 and
 * exits with a user-supplied code; the parent works on device 1, waits for
 * the child, and afterwards also uses device 0.
 *
 * Exit status: 0 on success in the parent, the entered value (0..255) in the
 * child, 1 on any failure (fork, wait, invalid input, CUDA error).
 */
int main()
{
    pid_t childpid; /* variable to store the child's pid */
    int retval = 0; /* child process: user-provided return code */
    int status = 0; /* parent process: child's exit status */

    /* only 1 int variable is needed because each process would have its
       own instance of the variable
       here, 2 int variables are used for clarity */

    /* now create new process */
    childpid = fork();

    if (childpid < 0) /* fork returns -1 on failure */
    {
        perror("fork"); /* display error message */
        exit(1);        /* non-zero: the program did not do its job */
    }

    if (childpid == 0) /* fork() returns 0 to the child process */
    {
        printf("CHILD: I am the child process!\n");
        printf("CHILD: Here's my PID: %d\n", getpid());
        printf("CHILD: My parent's PID is: %d\n", getppid());
        printf("CHILD: The value of my copy of childpid is: %d\n", childpid);
        printf("CHILD: result: %d\n",
               runAddOnDevice(0, 1, "CHILD cudaSetDevice fail"));

        printf("CHILD: Sleeping for 1 second...\n");
        sleep(1); /* sleep for 1 second */
        cudaDeviceReset();
        printf("CHILD: Enter an exit value (0 to 255): ");
        fflush(stdout); /* the prompt has no newline; make sure it is shown */
        /* reject unparsable or out-of-range input instead of exiting with an
           indeterminate value */
        if (scanf(" %d", &retval) != 1 || retval < 0 || retval > 255)
        {
            fprintf(stderr, "CHILD: invalid exit value, using 1\n");
            retval = 1;
        }
        printf("CHILD: Goodbye!\n");
        exit(retval); /* child exits with user-provided return code */
    }

    /* fork() returns new pid to the parent process */
    printf("PARENT: I am the parent process!\n");
    printf("PARENT: Here's my PID: %d\n", getpid());
    printf("PARENT: The value of my copy of childpid is %d\n", childpid);
    printf("PARENT: I will now wait for my child to exit.\n");
    printf("PARENT: result: %d\n",
           runAddOnDevice(1, 2, "PARENT cudaSetDevice fail"));

    /* wait for child to exit, and store its status */
    if (wait(&status) == -1)
    {
        perror("wait");
        exit(1);
    }
    /* WEXITSTATUS is only meaningful when the child exited normally */
    if (WIFEXITED(status))
        printf("PARENT: Child's exit code is: %d\n", WEXITSTATUS(status));
    else
        fprintf(stderr, "PARENT: Child did not exit normally\n");

    /* the child is gone by now; the parent uses device 0 as well */
    printf("PARENT: result2: %d\n",
           runAddOnDevice(0, 5, "PARENT cudaSetDevice  2 fail"));
    printf("PARENT: Goodbye!\n");
    exit(0);  /* parent exits */
}
$ nvcc -arch=sm_20 -o t345 t345.cu
$ ./t345
CHILD: I am the child process!
CHILD: Here's my PID: 23603
CHILD: My parent's PID is: 23602
CHILD: The value of my copy of childpid is: 0
PARENT: I am the parent process!
PARENT: Here's my PID: 23602
PARENT: The value of my copy of childpid is 23603
PARENT: I will now wait for my child to exit.
CHILD: result: 2
CHILD: Sleeping for 1 second...
PARENT: result: 3
CHILD: Enter an exit value (0 to 255): 10
CHILD: Goodbye!
PARENT: Child's exit code is: 10
PARENT: result2: 6
PARENT: Goodbye!
$

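$ cat t345.cu
#include <unistd.h>     /* Symbolic Constants */
#include <sys/types.h>  /* Primitive System Data Types */
#include <errno.h>      /* Errors */
#include <stdio.h>      /* Input/Output */
#include <sys/wait.h>   /* Wait for Process Termination */
#include <stdlib.h>     /* General Utilities */
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)
__global__ void addkernel(int *data){
  *data += 1;
}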
int main（）
{
pid\u t childpid；/*变量来存储子pid*/
int retval；/*子进程：用户提供的返回代码*/
int status；/*父进程：子进程的退出状态*/
/*只需要1个int变量，因为每个进程都有自己的
变量的自身实例
为了清晰起见，这里使用了两个int变量*/
/*现在创建新流程*/
childpid=fork（）；
如果（childpid>=0）/*分叉成功*/
{
if（childpid==0）/*fork（）将0返回给子进程*/
{
printf（“孩子：我是孩子进程！\n”）；
printf（“孩子：这是我的PID:%d\n”，getpid（））；
printf（“孩子：我父母的PID是：%d\n”，getppid（））；
printf（“CHILD:childpid副本的值为：%d\n”，childpid）；
int*h_a，*d_a；
h_a=（int*）malloc（sizeof（int））；
cudaSetDevice（0）；
cudaCheckErrors（“子CUDASETDEVIEL”）；
库达马洛克(d_a,sizeof(int)),；
cudaCheckErrors（“Cudamaloc失败”）；
*h_a=1；
cudaMemcpy（d_a，h_a，sizeof（int），cudamemcpyhostodevice）；
CUDACHECKERS（“cudaMemcpy H2D故障”）；
addkernel（d_a）；
cudaDeviceSynchronize（）；
cudaCheckErrors（“内核失败”）；
cudaMemcpy（h_a，d_a，sizeof（int），cudaMemcpyDeviceToHost）；
CUDACHECKERS（“cudaMemcpy D2H故障”）；
printf（“子：结果：%d\n”，*h\u a）；
printf（“孩子：睡1秒钟…\n”）；
睡眠（1）；/*睡眠1秒*/
cudaDeviceReset（）；
printf（“子项：输入一个退出值（0到255）：”；
scanf（“%d”、&retval）；
printf（“孩子：再见！\n”）；
退出（retval）；/*子级使用用户提供的返回代码退出*/
}
else/*fork（）将新pid返回给父进程*/
{
printf（“父进程：我是父进程！\n”）；
printf（“父：这是我的PID:%d\n”，getpid（））；
printf（“父项：我的childpid副本的值为%d\n”，childpid）；
printf（“家长：我现在将等待我的孩子退出。\n”）；
int*h_a，*d_a；
h_a=（int*）malloc（sizeof（int））；
cudaSetDevice（1）；
cudaCheckErrors（“父CUDASETDEVIEL”）；
库达马洛克(d_a,sizeof(int)),；
cudaCheckErrors（“Cudamaloc失败”）；
*h_a=2；
cudaMemcpy（d_a，h_a，sizeof（int），cudamemcpyhostodevice）；
CUDACHECKERS（“cudaMemcpy H2D故障”）；
addkernel（d_a）；
cudaDeviceSynchronize（）；
cudaCheckErrors（“内核失败”）；
cudaMemcpy（h_a，d_a，sizeof（int），cudaMemcpyDeviceToHost）；
CUDACHECKERS（“cudaMemcpy D2H故障”）；
printf（“父：结果：%d\n”，*h\u a）；
等待（&status）；/*等待子级退出，并存储其状态*/
printf（“父项：子项的退出代码为：%d\n”，WEXITSTATUS（status））；
cudaSetDevice（0）；
cudaCheckErrors（“父cudaSetDevice 2失败”）；
int*h_a2，*d_a2；
Cudamaloc（和d_a2，sizeof（int））；
cudaCheckErrors（“Cudamaloc失败”）；
h_a2=（int*）malloc（sizeof（int））；
*h_a2=5；
cudaMemcpy（d_a2，h_a2，sizeof（int），cudamemcpyhostodevice）；
CUDACHECKERS（“cudaMemcpy H2D故障”）；
addkernel（d_a2）；
cudaDeviceSynchronize（）；
cudaCheckErrors（“内核失败”）；
cudaMemcpy（h_a2，d_a2，sizeof（int），cudamemcpydevicetoost）；
CUDACHECKERS（“cudaMemcpy D2H故障”）；
printf（“父项：结果2:%d\n”，*h_a2）；
printf（“家长：再见！\n”）；
退出（0）；/*父级退出*/
}
}
其他的