C++ 当设备在此过程中处于活动状态时,无法设置CUDA固定内存实现错误

C++ 当设备在此过程中处于活动状态时,无法设置CUDA固定内存实现错误,c++,cuda,C++,Cuda,我想在我的代码中实现GPU的固定内存特性。为此,我编写了如下代码: bool addVectorGPU(float* M, float* N, float* P, int size) { // Error return value cudaError_t status; cudaSetDeviceFlags(cudaDeviceMapHost); // Number of bytes in the matrix. int bytes = size * sizeof(float); // Poin

我想在我的代码中实现GPU的固定内存特性。为此,我编写了如下代码:

// Zero-copy GPU vector add attempt: P = M + N on the device via mapped
// pinned host memory.  NOTE(review): this is the buggy version discussed in
// the answer below; comments flag the problems but the code is left as-is.
bool addVectorGPU(float* M, float* N, float* P, int size)
{
// Error return value
cudaError_t status;
// BUG: cudaSetDeviceFlags must be called once, before the CUDA context is
// created.  On the second call of this function the device is already
// active, so this fails with cudaErrorSetOnActiveProcess (return unchecked).
cudaSetDeviceFlags(cudaDeviceMapHost);
// Number of bytes in the matrix.
int bytes = size * sizeof(float);
// Pointers to the device arrays
float *Md, *Nd, *Pd;
// Allocate memory on the device to store each matrix

// BUG: cudaHostAlloc creates NEW pinned allocations and overwrites the
// caller's pointers M/N/P, so the caller's input data is never read and the
// result is never visible to the caller.  cudaHostRegister should be used
// to pin the existing buffers instead.  Return values are also unchecked.
cudaHostAlloc((void**)&M, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&N, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&P, bytes, cudaHostAllocMapped);
// Copy the host input data to the device

// Obtain device-side aliases of the mapped host buffers (returns unchecked).
cudaHostGetDevicePointer((void**)&Md, M, 0);
cudaHostGetDevicePointer((void**)&Nd, N, 0);
cudaHostGetDevicePointer((void**)&Pd, P, 0);

// Specify the size of the grid and the size of the block
dim3 dimBlock(TILE_SIZE); // Matrix is contained in a block
dim3 dimGrid((int)ceil((float)size / (float)TILE_SIZE)); 
// Launch the kernel on a size-by-size block of threads
addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
// Wait for completion
// NOTE(review): cudaThreadSynchronize is deprecated and redundant here;
// cudaDeviceSynchronize below already waits for the kernel.
cudaThreadSynchronize();
cudaDeviceSynchronize();
// Check for errors
status = cudaGetLastError();
if (status != cudaSuccess) {
std::cout << "Kernel failed: " << cudaGetErrorString(status) <<
std::endl;
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);

return false;
}
// Retrieve the result matrix
//cudaHostGetDevicePointer((void**)&Pd, P, 0);
// Free device memory
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);
// BUG: Md/Nd/Pd are mapped aliases of the host allocations, not separate
// cudaMalloc allocations; calling cudaFree on them is invalid.
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
// Success
return true;
}

有人知道这是怎么回事吗

如果您通过检查每个运行时API调用的返回值来更好地检查cuda错误,您将发现此错误是在您第二次调用此函数时返回的:

cudaSetDeviceFlags(cudaDeviceMapHost);
请注意,对于此运行时API调用:

如果当前设备已设置且该设备已初始化,则此调用将失败,并出现错误cudaErrorSetOnActiveProcess

解决方案是在应用程序开始时只调用一次 cudaSetDeviceFlags,而不是每次调用
addVectorGPU
函数。在第一次调用
addVectorGPU
之前,将该调用从
addVectorGPU
函数中取出,并将其放入
main
例程中

基于以下问题,该准则还存在其他各种问题:

  • 我建议对所有内核调用和所有CUDAAPI调用执行适当的cuda错误检查,而不是在例程结束时执行一次

  • cudaHostAlloc
    的用法不正确。该程序的目的似乎是把指向主机驻留数据的主机指针传递给GPU例程,然后使用零拷贝技术对该数据做加法。这在技术上是可行的(尽管速度非常慢),但正确的方法是使用 cudaHostRegister,而不是
    cudaHostAlloc。
    cudaHostAlloc
    会创建新的分配,因此传递给函数的现有数据不会以这种方式被使用或引用

  • 下面是一个工作示例,基于您所展示的内容。请注意,我个人不会以这种方式对事物进行基准测试,但我提供这一点是为了表明该过程可以以无错误的方式工作:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <iostream>
    
    #define TILE_SIZE 512
    #define SIZE 1048576
    #define ITERS 10
    
    // Serial CPU reference: computes P[i] = M[i] + N[i] for every element;
    // always reports success.
    bool addVectorCPU(float *M, float *N, float *P, int size){
      int i = 0;
      while (i < size) {
        P[i] = M[i] + N[i];
        ++i;
      }
      return true;
    }
    // Element-wise vector addition kernel: P = M + N, one thread per element.
    __global__ void addVectorKernel(float *M, float *N, float *P,int  size){
      // Flat global thread index across the 1-D grid.
      int gid = blockDim.x * blockIdx.x + threadIdx.x;
      // Bounds check: the rounded-up grid may contain extra tail threads.
      if (gid < size)
        P[gid] = M[gid] + N[gid];
    }
    
    // Zero-copy GPU vector add: pins the caller's existing host buffers with
    // cudaHostRegister, maps them into the device address space, and runs
    // addVectorKernel on the mapped pointers, so the result lands directly
    // in P.  Every CUDA call is checked, as recommended above.
    // Precondition: cudaSetDeviceFlags(cudaDeviceMapHost) was called once in
    // main(), before the CUDA context was created.
    // Returns true on success, false if any CUDA call or the kernel fails.
    bool addVectorGPU(float* M, float* N, float* P, int size)
    {
    // Error return value
      cudaError_t status;
    // Number of bytes in the matrix.
      int bytes = size * sizeof(float);
    // Device-side aliases of the pinned host buffers M, N, P
      float *Md, *Nd, *Pd;
      bool res = true;

    // Pin the caller's buffers and map them for device access.  Unlike
    // cudaHostAlloc, cudaHostRegister reuses the existing allocations.
      float*  hostPtrs[3] = { M, N, P };
      float** devPtrs[3]  = { &Md, &Nd, &Pd };
      int registered = 0;   // how many buffers were pinned, for cleanup
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostRegister(hostPtrs[i], bytes, cudaHostRegisterMapped);
        if (status != cudaSuccess) {
          std::cout << "cudaHostRegister failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        } else {
          registered++;
        }
      }
    // Fetch the device pointers that alias the pinned host memory.
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostGetDevicePointer((void**)devPtrs[i], hostPtrs[i], 0);
        if (status != cudaSuccess) {
          std::cout << "cudaHostGetDevicePointer failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }

      if (res) {
    // One thread per element; round the grid size up to cover the tail.
        dim3 dimBlock(TILE_SIZE);
        dim3 dimGrid((size + TILE_SIZE - 1) / TILE_SIZE);
        addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
    // Launch-configuration errors surface immediately via cudaGetLastError...
        status = cudaGetLastError();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
      if (res) {
    // ...asynchronous execution errors surface at the next sync point.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
    // Unpin whatever was successfully registered, on all paths.
      for (int i = 0; i < registered; i++)
        cudaHostUnregister(hostPtrs[i]);
      return res;
    }
    
    // Driver: times CPU vs GPU (zero-copy) vector addition over ITERS runs
    // and verifies that the two results agree before releasing the buffers.
    int main(){
    // Timing data
      float tcpuadd, tgpuadd;
      clock_t start, end;
      bool success;

    //Allocate the four vectors of SIZE floats
      float* M = new float[SIZE];
      float* N = new float[SIZE];
      float* Pcpu = new float[SIZE];
      float* Pgpu = new float[SIZE];
    //Initialize M and N to random values in [0, 1]
      for (int i = 0; i < SIZE; i ++){
        M[i] = rand()/(float)(RAND_MAX);
        N[i] = rand()/(float)(RAND_MAX);
        }
      printf("Operating on a vector of length %d\n", SIZE);
    //Add two vectors and compute timing in CPU
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        addVectorCPU(M, N, Pcpu, SIZE);
        }

      end = clock();
      tcpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf( "CPU Addition took %f ms\n", tcpuadd);
    // Enable mapped (zero-copy) host memory.  This must happen exactly once,
    // before the CUDA context is created; calling it again once the device
    // is active fails with cudaErrorSetOnActiveProcess.
      cudaError_t status = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (status != cudaSuccess) {
        printf("cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(status));
        return 1;
        }
    // Warm-up call: creates the CUDA context so it is not timed below.
      success = addVectorGPU(M, N ,Pgpu , SIZE);
      if(!success)
        {
        printf("Device Error!\n");
        return 1;
        }
    //compute GPU timing
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        // Abort on any failure instead of silently timing broken runs.
        if (!addVectorGPU(M, N, Pgpu, SIZE)) {
          printf("Device Error!\n");
          return 1;
          }
        }
      end = clock();
      tgpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf("GPU Addition took %f ms\n", tgpuadd);
    // Verify the GPU result against the CPU reference.
      for (int i = 0; i < SIZE; i++) {
        float diff = Pcpu[i] - Pgpu[i];
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-5f) {
          printf("Mismatch at %d: %f vs %f\n", i, Pcpu[i], Pgpu[i]);
          return 1;
          }
        }
    // Release the host buffers (previously leaked: new[] was never freed).
      delete [] M;
      delete [] N;
      delete [] Pcpu;
      delete [] Pgpu;
      return 0;
    }
    
    (此处原为上方英文代码的机器翻译残留 —— “#包括”即 #include、“#定义”即 #define 等 —— 内容与上方代码完全相同,已略去。)

如果您通过检查每个运行时API调用的返回值来更好地检查cuda错误,您将发现此错误是在您第二次调用此函数时返回的:

    cudaSetDeviceFlags(cudaDeviceMapHost);
    
    请注意,对于此运行时API调用:

    如果当前设备已设置且该设备已初始化,则此调用将失败,并出现错误cudaErrorSetOnActiveProcess

    解决方案是在应用程序开始时只调用一次 cudaSetDeviceFlags,而不是每次调用
    addVectorGPU
    函数。在第一次调用
    addVectorGPU
    函数之前,将该调用从
    addVectorGPU
    函数中取出,并将其放入
    main
    例程中

    基于以下问题,该准则还存在其他各种问题:

  • 我建议对所有内核调用和所有CUDAAPI调用执行适当的cuda错误检查,而不是在例程结束时执行一次

  • cudaHostAlloc
    的使用不正确。该程序的目的似乎是把指向主机驻留数据的主机指针传递给GPU例程,然后使用零拷贝技术对该数据做加法。这在技术上是可行的(尽管速度非常慢),但正确的方法是使用 cudaHostRegister,而不是使用
    cudaHostAlloc。
    cudaHostAlloc
    会创建新的分配,因此传递给函数的现有数据不会以这种方式被使用或引用

  • 以下是一个基于您所展示内容的有效示例。请注意,我个人不会以这种方式对事物进行基准测试,但我提供此示例是为了表明流程可以以无错误的方式工作:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <iostream>
    
    #define TILE_SIZE 512
    #define SIZE 1048576
    #define ITERS 10
    
    // Host-side reference implementation of the vector sum; walks the three
    // arrays in lockstep and unconditionally returns true.
    bool addVectorCPU(float *M, float *N, float *P, int size){
      for (int k = 0; k < size; ++k) {
        P[k] = M[k] + N[k];
      }
      return true;
    }
    // One thread per element: P[i] = M[i] + N[i], with a tail guard.
    __global__ void addVectorKernel(float *M, float *N, float *P,int  size){
      int i = threadIdx.x + blockIdx.x * blockDim.x;  // flat global index
      if (i >= size)
        return;  // the rounded-up grid may overshoot the vector length
      P[i] = M[i] + N[i];
    }
    
    // Zero-copy GPU vector add: pins the caller's existing host buffers with
    // cudaHostRegister, maps them into the device address space, and runs
    // addVectorKernel on the mapped pointers, so the result lands directly
    // in P.  Every CUDA call is checked, as recommended above.
    // Precondition: cudaSetDeviceFlags(cudaDeviceMapHost) was called once in
    // main(), before the CUDA context was created.
    // Returns true on success, false if any CUDA call or the kernel fails.
    bool addVectorGPU(float* M, float* N, float* P, int size)
    {
    // Error return value
      cudaError_t status;
    // Number of bytes in the matrix.
      int bytes = size * sizeof(float);
    // Device-side aliases of the pinned host buffers M, N, P
      float *Md, *Nd, *Pd;
      bool res = true;

    // Pin the caller's buffers and map them for device access.  Unlike
    // cudaHostAlloc, cudaHostRegister reuses the existing allocations.
      float*  hostPtrs[3] = { M, N, P };
      float** devPtrs[3]  = { &Md, &Nd, &Pd };
      int registered = 0;   // how many buffers were pinned, for cleanup
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostRegister(hostPtrs[i], bytes, cudaHostRegisterMapped);
        if (status != cudaSuccess) {
          std::cout << "cudaHostRegister failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        } else {
          registered++;
        }
      }
    // Fetch the device pointers that alias the pinned host memory.
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostGetDevicePointer((void**)devPtrs[i], hostPtrs[i], 0);
        if (status != cudaSuccess) {
          std::cout << "cudaHostGetDevicePointer failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }

      if (res) {
    // One thread per element; round the grid size up to cover the tail.
        dim3 dimBlock(TILE_SIZE);
        dim3 dimGrid((size + TILE_SIZE - 1) / TILE_SIZE);
        addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
    // Launch-configuration errors surface immediately via cudaGetLastError...
        status = cudaGetLastError();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
      if (res) {
    // ...asynchronous execution errors surface at the next sync point.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
    // Unpin whatever was successfully registered, on all paths.
      for (int i = 0; i < registered; i++)
        cudaHostUnregister(hostPtrs[i]);
      return res;
    }
    
    // Driver: times CPU vs GPU (zero-copy) vector addition over ITERS runs
    // and verifies that the two results agree before releasing the buffers.
    int main(){
    // Timing data
      float tcpuadd, tgpuadd;
      clock_t start, end;
      bool success;

    //Allocate the four vectors of SIZE floats
      float* M = new float[SIZE];
      float* N = new float[SIZE];
      float* Pcpu = new float[SIZE];
      float* Pgpu = new float[SIZE];
    //Initialize M and N to random values in [0, 1]
      for (int i = 0; i < SIZE; i ++){
        M[i] = rand()/(float)(RAND_MAX);
        N[i] = rand()/(float)(RAND_MAX);
        }
      printf("Operating on a vector of length %d\n", SIZE);
    //Add two vectors and compute timing in CPU
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        addVectorCPU(M, N, Pcpu, SIZE);
        }

      end = clock();
      tcpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf( "CPU Addition took %f ms\n", tcpuadd);
    // Enable mapped (zero-copy) host memory.  This must happen exactly once,
    // before the CUDA context is created; calling it again once the device
    // is active fails with cudaErrorSetOnActiveProcess.
      cudaError_t status = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (status != cudaSuccess) {
        printf("cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(status));
        return 1;
        }
    // Warm-up call: creates the CUDA context so it is not timed below.
      success = addVectorGPU(M, N ,Pgpu , SIZE);
      if(!success)
        {
        printf("Device Error!\n");
        return 1;
        }
    //compute GPU timing
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        // Abort on any failure instead of silently timing broken runs.
        if (!addVectorGPU(M, N, Pgpu, SIZE)) {
          printf("Device Error!\n");
          return 1;
          }
        }
      end = clock();
      tgpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf("GPU Addition took %f ms\n", tgpuadd);
    // Verify the GPU result against the CPU reference.
      for (int i = 0; i < SIZE; i++) {
        float diff = Pcpu[i] - Pgpu[i];
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-5f) {
          printf("Mismatch at %d: %f vs %f\n", i, Pcpu[i], Pgpu[i]);
          return 1;
          }
        }
    // Release the host buffers (previously leaked: new[] was never freed).
      delete [] M;
      delete [] N;
      delete [] Pcpu;
      delete [] Pgpu;
      return 0;
    }
    
    (此处原为上方英文代码的机器翻译残留 —— “#包括”即 #include、“#定义”即 #define 等 —— 内容与上方代码完全相同,已略去。)