C++ 当设备在此过程中处于活动状态时,无法设置CUDA固定内存实现错误

C++ 当设备在此过程中处于活动状态时,无法设置CUDA固定内存实现错误,c++,cuda,C++,Cuda,我想在我的代码中实现GPU的固定内存特性。为此,我编写了如下代码: bool addVectorGPU(float* M, float* N, float* P, int size) { // Error return value cudaError_t status; cudaSetDeviceFlags(cudaDeviceMapHost); // Number of bytes in the matrix. int bytes = size * sizeof(float); // Poin

我想在我的代码中实现GPU的固定内存特性。为此,我编写了如下代码:

// Zero-copy GPU vector add attempt: P = M + N on the device via mapped
// pinned host memory.  NOTE(review): this is the buggy version discussed in
// the answer below; comments flag the problems but the code is left as-is.
bool addVectorGPU(float* M, float* N, float* P, int size)
{
// Error return value
cudaError_t status;
// BUG: cudaSetDeviceFlags must be called once, before the CUDA context is
// created.  On the second call of this function the device is already
// active, so this fails with cudaErrorSetOnActiveProcess (return unchecked).
cudaSetDeviceFlags(cudaDeviceMapHost);
// Number of bytes in the matrix.
int bytes = size * sizeof(float);
// Pointers to the device arrays
float *Md, *Nd, *Pd;
// Allocate memory on the device to store each matrix

// BUG: cudaHostAlloc creates NEW pinned allocations and overwrites the
// caller's pointers M/N/P, so the caller's input data is never read and the
// result is never visible to the caller.  cudaHostRegister should be used
// to pin the existing buffers instead.  Return values are also unchecked.
cudaHostAlloc((void**)&M, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&N, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&P, bytes, cudaHostAllocMapped);
// Copy the host input data to the device

// Obtain device-side aliases of the mapped host buffers (returns unchecked).
cudaHostGetDevicePointer((void**)&Md, M, 0);
cudaHostGetDevicePointer((void**)&Nd, N, 0);
cudaHostGetDevicePointer((void**)&Pd, P, 0);

// Specify the size of the grid and the size of the block
dim3 dimBlock(TILE_SIZE); // Matrix is contained in a block
dim3 dimGrid((int)ceil((float)size / (float)TILE_SIZE)); 
// Launch the kernel on a size-by-size block of threads
addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
// Wait for completion
// NOTE(review): cudaThreadSynchronize is deprecated and redundant here;
// cudaDeviceSynchronize below already waits for the kernel.
cudaThreadSynchronize();
cudaDeviceSynchronize();
// Check for errors
status = cudaGetLastError();
if (status != cudaSuccess) {
std::cout << "Kernel failed: " << cudaGetErrorString(status) <<
std::endl;
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);

return false;
}
// Retrieve the result matrix
//cudaHostGetDevicePointer((void**)&Pd, P, 0);
// Free device memory
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);
// BUG: Md/Nd/Pd are mapped aliases of the host allocations, not separate
// cudaMalloc allocations; calling cudaFree on them is invalid.
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
// Success
return true;
}

有人知道这是怎么回事吗

如果您通过检查每个运行时API调用的返回值来更好地检查cuda错误,您将发现此错误是在您第二次调用此函数时返回的:

cudaSetDeviceFlags(cudaDeviceMapHost);
请注意,对于此运行时API调用:

如果当前设备已设置且该设备已初始化,则此调用将失败,并出现错误cudaErrorSetOnActiveProcess

解决方案是在应用程序开始时只调用一次 cudaSetDeviceFlags,而不是每次调用
addVectorGPU
函数。在第一次调用
addVectorGPU
之前,将该调用从
addVectorGPU
函数中取出,并将其放入
main
例程中

基于以下问题,该准则还存在其他各种问题:

  • 我建议对所有内核调用和所有CUDAAPI调用执行适当的cuda错误检查,而不是在例程结束时执行一次

  • cudaHostAlloc
    的用法不正确。该程序的目的似乎是把指向主机驻留数据的主机指针传递给GPU例程,然后使用零拷贝技术对该数据做加法。这在技术上是可行的(尽管速度非常慢),但正确的方法是使用 cudaHostRegister,而不是
    cudaHostAlloc。
    cudaHostAlloc
    会创建新的分配,因此传递给函数的现有数据不会以这种方式被使用或引用

  • 下面是一个工作示例,基于您所展示的内容。请注意,我个人不会以这种方式对事物进行基准测试,但我提供这一点是为了表明该过程可以以无错误的方式工作:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <iostream>
    
    #define TILE_SIZE 512
    #define SIZE 1048576
    #define ITERS 10
    
    // Serial CPU reference: computes P[i] = M[i] + N[i] for every element;
    // always reports success.
    bool addVectorCPU(float *M, float *N, float *P, int size){
      int i = 0;
      while (i < size) {
        P[i] = M[i] + N[i];
        ++i;
      }
      return true;
    }
    // Element-wise vector addition kernel: P = M + N, one thread per element.
    __global__ void addVectorKernel(float *M, float *N, float *P,int  size){
      // Flat global thread index across the 1-D grid.
      int gid = blockDim.x * blockIdx.x + threadIdx.x;
      // Bounds check: the rounded-up grid may contain extra tail threads.
      if (gid < size)
        P[gid] = M[gid] + N[gid];
    }
    
    // Zero-copy GPU vector add: pins the caller's existing host buffers with
    // cudaHostRegister, maps them into the device address space, and runs
    // addVectorKernel on the mapped pointers, so the result lands directly
    // in P.  Every CUDA call is checked, as recommended above.
    // Precondition: cudaSetDeviceFlags(cudaDeviceMapHost) was called once in
    // main(), before the CUDA context was created.
    // Returns true on success, false if any CUDA call or the kernel fails.
    bool addVectorGPU(float* M, float* N, float* P, int size)
    {
    // Error return value
      cudaError_t status;
    // Number of bytes in the matrix.
      int bytes = size * sizeof(float);
    // Device-side aliases of the pinned host buffers M, N, P
      float *Md, *Nd, *Pd;
      bool res = true;

    // Pin the caller's buffers and map them for device access.  Unlike
    // cudaHostAlloc, cudaHostRegister reuses the existing allocations.
      float*  hostPtrs[3] = { M, N, P };
      float** devPtrs[3]  = { &Md, &Nd, &Pd };
      int registered = 0;   // how many buffers were pinned, for cleanup
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostRegister(hostPtrs[i], bytes, cudaHostRegisterMapped);
        if (status != cudaSuccess) {
          std::cout << "cudaHostRegister failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        } else {
          registered++;
        }
      }
    // Fetch the device pointers that alias the pinned host memory.
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostGetDevicePointer((void**)devPtrs[i], hostPtrs[i], 0);
        if (status != cudaSuccess) {
          std::cout << "cudaHostGetDevicePointer failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }

      if (res) {
    // One thread per element; round the grid size up to cover the tail.
        dim3 dimBlock(TILE_SIZE);
        dim3 dimGrid((size + TILE_SIZE - 1) / TILE_SIZE);
        addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
    // Launch-configuration errors surface immediately via cudaGetLastError...
        status = cudaGetLastError();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
      if (res) {
    // ...asynchronous execution errors surface at the next sync point.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
    // Unpin whatever was successfully registered, on all paths.
      for (int i = 0; i < registered; i++)
        cudaHostUnregister(hostPtrs[i]);
      return res;
    }
    
    // Driver: times CPU vs GPU (zero-copy) vector addition over ITERS runs
    // and verifies that the two results agree before releasing the buffers.
    int main(){
    // Timing data
      float tcpuadd, tgpuadd;
      clock_t start, end;
      bool success;

    //Allocate the four vectors of SIZE floats
      float* M = new float[SIZE];
      float* N = new float[SIZE];
      float* Pcpu = new float[SIZE];
      float* Pgpu = new float[SIZE];
    //Initialize M and N to random values in [0, 1]
      for (int i = 0; i < SIZE; i ++){
        M[i] = rand()/(float)(RAND_MAX);
        N[i] = rand()/(float)(RAND_MAX);
        }
      printf("Operating on a vector of length %d\n", SIZE);
    //Add two vectors and compute timing in CPU
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        addVectorCPU(M, N, Pcpu, SIZE);
        }

      end = clock();
      tcpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf( "CPU Addition took %f ms\n", tcpuadd);
    // Enable mapped (zero-copy) host memory.  This must happen exactly once,
    // before the CUDA context is created; calling it again once the device
    // is active fails with cudaErrorSetOnActiveProcess.
      cudaError_t status = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (status != cudaSuccess) {
        printf("cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(status));
        return 1;
        }
    // Warm-up call: creates the CUDA context so it is not timed below.
      success = addVectorGPU(M, N ,Pgpu , SIZE);
      if(!success)
        {
        printf("Device Error!\n");
        return 1;
        }
    //compute GPU timing
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        // Abort on any failure instead of silently timing broken runs.
        if (!addVectorGPU(M, N, Pgpu, SIZE)) {
          printf("Device Error!\n");
          return 1;
          }
        }
      end = clock();
      tgpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf("GPU Addition took %f ms\n", tgpuadd);
    // Verify the GPU result against the CPU reference.
      for (int i = 0; i < SIZE; i++) {
        float diff = Pcpu[i] - Pgpu[i];
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-5f) {
          printf("Mismatch at %d: %f vs %f\n", i, Pcpu[i], Pgpu[i]);
          return 1;
          }
        }
    // Release the host buffers (previously leaked: new[] was never freed).
      delete [] M;
      delete [] N;
      delete [] Pcpu;
      delete [] Pgpu;
      return 0;
    }
    
    (此处原为上方英文代码的机器翻译残留 —— “#包括”即 #include、“#定义”即 #define 等 —— 内容与上方代码完全相同,已略去。)

如果您通过检查每个运行时API调用的返回值来更好地检查cuda错误,您将发现此错误是在您第二次调用此函数时返回的:

    cudaSetDeviceFlags(cudaDeviceMapHost);
    
    请注意,对于此运行时API调用:

    如果当前设备已设置且该设备已初始化,则此调用将失败,并出现错误cudaErrorSetOnActiveProcess

    解决方案是在应用程序开始时只调用一次 cudaSetDeviceFlags,而不是每次调用
    addVectorGPU
    函数。在第一次调用
    addVectorGPU
    函数之前,将该调用从
    addVectorGPU
    函数中取出,并将其放入
    main
    例程中

    基于以下问题,该准则还存在其他各种问题:

  • 我建议对所有内核调用和所有CUDAAPI调用执行适当的cuda错误检查,而不是在例程结束时执行一次

  • cudaHostAlloc
    的使用不正确。该程序的目的似乎是把指向主机驻留数据的主机指针传递给GPU例程,然后使用零拷贝技术对该数据做加法。这在技术上是可行的(尽管速度非常慢),但正确的方法是使用 cudaHostRegister,而不是使用
    cudaHostAlloc。
    cudaHostAlloc
    会创建新的分配,因此传递给函数的现有数据不会以这种方式被使用或引用

  • 以下是一个基于您所展示内容的有效示例。请注意,我个人不会以这种方式对事物进行基准测试,但我提供此示例是为了表明流程可以以无错误的方式工作:

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>
    #include <iostream>
    
    #define TILE_SIZE 512
    #define SIZE 1048576
    #define ITERS 10
    
    // Host-side reference implementation of the vector sum; walks the three
    // arrays in lockstep and unconditionally returns true.
    bool addVectorCPU(float *M, float *N, float *P, int size){
      for (int k = 0; k < size; ++k) {
        P[k] = M[k] + N[k];
      }
      return true;
    }
    // One thread per element: P[i] = M[i] + N[i], with a tail guard.
    __global__ void addVectorKernel(float *M, float *N, float *P,int  size){
      int i = threadIdx.x + blockIdx.x * blockDim.x;  // flat global index
      if (i >= size)
        return;  // the rounded-up grid may overshoot the vector length
      P[i] = M[i] + N[i];
    }
    
    // Zero-copy GPU vector add: pins the caller's existing host buffers with
    // cudaHostRegister, maps them into the device address space, and runs
    // addVectorKernel on the mapped pointers, so the result lands directly
    // in P.  Every CUDA call is checked, as recommended above.
    // Precondition: cudaSetDeviceFlags(cudaDeviceMapHost) was called once in
    // main(), before the CUDA context was created.
    // Returns true on success, false if any CUDA call or the kernel fails.
    bool addVectorGPU(float* M, float* N, float* P, int size)
    {
    // Error return value
      cudaError_t status;
    // Number of bytes in the matrix.
      int bytes = size * sizeof(float);
    // Device-side aliases of the pinned host buffers M, N, P
      float *Md, *Nd, *Pd;
      bool res = true;

    // Pin the caller's buffers and map them for device access.  Unlike
    // cudaHostAlloc, cudaHostRegister reuses the existing allocations.
      float*  hostPtrs[3] = { M, N, P };
      float** devPtrs[3]  = { &Md, &Nd, &Pd };
      int registered = 0;   // how many buffers were pinned, for cleanup
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostRegister(hostPtrs[i], bytes, cudaHostRegisterMapped);
        if (status != cudaSuccess) {
          std::cout << "cudaHostRegister failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        } else {
          registered++;
        }
      }
    // Fetch the device pointers that alias the pinned host memory.
      for (int i = 0; i < 3 && res; i++) {
        status = cudaHostGetDevicePointer((void**)devPtrs[i], hostPtrs[i], 0);
        if (status != cudaSuccess) {
          std::cout << "cudaHostGetDevicePointer failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }

      if (res) {
    // One thread per element; round the grid size up to cover the tail.
        dim3 dimBlock(TILE_SIZE);
        dim3 dimGrid((size + TILE_SIZE - 1) / TILE_SIZE);
        addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
    // Launch-configuration errors surface immediately via cudaGetLastError...
        status = cudaGetLastError();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
      if (res) {
    // ...asynchronous execution errors surface at the next sync point.
        status = cudaDeviceSynchronize();
        if (status != cudaSuccess) {
          std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
          res = false;
        }
      }
    // Unpin whatever was successfully registered, on all paths.
      for (int i = 0; i < registered; i++)
        cudaHostUnregister(hostPtrs[i]);
      return res;
    }
    
    // Driver: times CPU vs GPU (zero-copy) vector addition over ITERS runs
    // and verifies that the two results agree before releasing the buffers.
    int main(){
    // Timing data
      float tcpuadd, tgpuadd;
      clock_t start, end;
      bool success;

    //Allocate the four vectors of SIZE floats
      float* M = new float[SIZE];
      float* N = new float[SIZE];
      float* Pcpu = new float[SIZE];
      float* Pgpu = new float[SIZE];
    //Initialize M and N to random values in [0, 1]
      for (int i = 0; i < SIZE; i ++){
        M[i] = rand()/(float)(RAND_MAX);
        N[i] = rand()/(float)(RAND_MAX);
        }
      printf("Operating on a vector of length %d\n", SIZE);
    //Add two vectors and compute timing in CPU
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        addVectorCPU(M, N, Pcpu, SIZE);
        }

      end = clock();
      tcpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf( "CPU Addition took %f ms\n", tcpuadd);
    // Enable mapped (zero-copy) host memory.  This must happen exactly once,
    // before the CUDA context is created; calling it again once the device
    // is active fails with cudaErrorSetOnActiveProcess.
      cudaError_t status = cudaSetDeviceFlags(cudaDeviceMapHost);
      if (status != cudaSuccess) {
        printf("cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(status));
        return 1;
        }
    // Warm-up call: creates the CUDA context so it is not timed below.
      success = addVectorGPU(M, N ,Pgpu , SIZE);
      if(!success)
        {
        printf("Device Error!\n");
        return 1;
        }
    //compute GPU timing
      start = clock();
      for (int i = 0; i < ITERS; i++) {
        // Abort on any failure instead of silently timing broken runs.
        if (!addVectorGPU(M, N, Pgpu, SIZE)) {
          printf("Device Error!\n");
          return 1;
          }
        }
      end = clock();
      tgpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
      printf("GPU Addition took %f ms\n", tgpuadd);
    // Verify the GPU result against the CPU reference.
      for (int i = 0; i < SIZE; i++) {
        float diff = Pcpu[i] - Pgpu[i];
        if (diff < 0.0f) diff = -diff;
        if (diff > 1e-5f) {
          printf("Mismatch at %d: %f vs %f\n", i, Pcpu[i], Pgpu[i]);
          return 1;
          }
        }
    // Release the host buffers (previously leaked: new[] was never freed).
      delete [] M;
      delete [] N;
      delete [] Pcpu;
      delete [] Pgpu;
      return 0;
    }
    
    (此处原为上方英文代码的机器翻译残留 —— “#包括”即 #include、“#定义”即 #define 等 —— 内容与上方代码完全相同,已略去。)