C++ CUDA固定内存(pinned memory)实现报错:“cannot set while device is active in this process”(设备在此进程中处于活动状态时无法设置)
我想在我的代码中实现GPU的固定内存特性。为此,我编写了如下代码:C++ 当设备在此过程中处于活动状态时,无法设置CUDA固定内存实现错误,c++,cuda,C++,Cuda,我想在我的代码中实现GPU的固定内存特性。为此,我编写了如下代码: bool addVectorGPU(float* M, float* N, float* P, int size) { // Error return value cudaError_t status; cudaSetDeviceFlags(cudaDeviceMapHost); // Number of bytes in the matrix. int bytes = size * sizeof(float); // Poin
// Adds two float vectors on the GPU through mapped ("zero-copy") pinned host
// memory and reports whether a CUDA error was pending afterwards.
//   M, N : input vectors      P : output vector      size : element count
// Returns true when cudaGetLastError() reports cudaSuccess after the kernel,
// false otherwise (the error string is printed to std::cout).
// NOTE(review): the cudaHostAlloc calls below overwrite the local copies of
// M, N and P, so the caller's buffers are never read or written by the kernel.
bool addVectorGPU(float* M, float* N, float* P, int size)
{
// Error return value
cudaError_t status;
// Ask the runtime to allow mapped pinned allocations. The return value is not
// checked. NOTE(review): this call fails with cudaErrorSetOnActiveProcess once
// the device has been initialized, i.e. on every invocation after the first.
cudaSetDeviceFlags(cudaDeviceMapHost);
// Number of bytes in each vector.
int bytes = size * sizeof(float);
// Device-side pointers to the mapped host buffers
float *Md, *Nd, *Pd;
// Allocate three NEW mapped pinned host buffers; the parameter values passed
// in by the caller are replaced by these allocations (results unchecked).
cudaHostAlloc((void**)&M, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&N, bytes, cudaHostAllocMapped);
cudaHostAlloc((void**)&P, bytes, cudaHostAllocMapped);
// Look up the device pointer for each mapped host buffer (no data is copied)
cudaHostGetDevicePointer((void**)&Md, M, 0);
cudaHostGetDevicePointer((void**)&Nd, N, 0);
cudaHostGetDevicePointer((void**)&Pd, P, 0);
// Launch configuration: TILE_SIZE threads per block, ceil(size / TILE_SIZE) blocks
dim3 dimBlock(TILE_SIZE); // TILE_SIZE threads per block
dim3 dimGrid((int)ceil((float)size / (float)TILE_SIZE));
// Launch the kernel on the one-dimensional grid computed above
addVectorKernel<<<dimGrid, dimBlock>>>(Md, Nd, Pd, size);
// Wait for completion
// NOTE(review): cudaThreadSynchronize is the deprecated predecessor of
// cudaDeviceSynchronize; the two calls appear redundant - confirm and drop one.
cudaThreadSynchronize();
cudaDeviceSynchronize();
// Check for errors: this reads the last error recorded by ANY of the calls
// above, not only by the kernel.
status = cudaGetLastError();
if (status != cudaSuccess) {
std::cout << "Kernel failed: " << cudaGetErrorString(status) <<
std::endl;
// Release the pinned buffers allocated above before reporting failure
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);
return false;
}
// Retrieve the result matrix
//cudaHostGetDevicePointer((void**)&Pd, P, 0);
// Release the pinned host buffers allocated above
cudaFreeHost(M);
cudaFreeHost(N);
cudaFreeHost(P);
// NOTE(review): Md/Nd/Pd were obtained from cudaHostGetDevicePointer, not
// from a device allocation, and their host buffers were already freed just
// above; these cudaFree calls look incorrect - confirm and remove.
cudaFree(Md);
cudaFree(Nd);
cudaFree(Pd);
// Success
return true;
}
有人知道这是怎么回事吗?
如果您对每个运行时API调用的返回值都进行检查,从而更完善地进行CUDA错误检查,就会发现该错误是在第二次调用此函数时由下面这个调用返回的:
cudaSetDeviceFlags(cudaDeviceMapHost);
请注意,对于此运行时API调用:
如果当前设备已设置且该设备已初始化,则此调用将失败,并出现错误cudaErrorSetOnActiveProcess
解决方案是只在应用程序启动时调用一次 cudaSetDeviceFlags,而不是每次调用 addVectorGPU 函数时都调用它。请将该调用从 addVectorGPU 函数中移出,放到 main 例程中、第一次调用 addVectorGPU 之前。
此外,这段代码还存在其他一些问题,如下所述:
cudaHostAlloc 的用法不正确。该程序的目的似乎是把指向主机驻留数据的主机指针传给GPU例程,然后用零拷贝(zero-copy)技术对这些数据做加法。这在技术上是可行的(尽管速度会非常慢),但正确的做法是使用 cudaHostRegister,而不是 cudaHostAlloc。cudaHostAlloc 会创建新的分配,因此传入函数的现有数据既不会被使用,也不会被引用。
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#define TILE_SIZE 512
#define SIZE 1048576
#define ITERS 10
// Host reference implementation: stores M[k] + N[k] into P[k] for every k in
// [0, size), in increasing index order. Always returns true.
bool addVectorCPU(float *M, float *N, float *P, int size){
    int k = 0;
    while (k < size) {
        const float sum = M[k] + N[k];
        P[k] = sum;
        ++k;
    }
    return true;
}
// Element-wise vector addition kernel: each thread handles at most one element.
// The element index is derived from the x components of blockIdx, blockDim and
// threadIdx only; threads whose index is >= size return without touching memory.
__global__ void addVectorKernel(float *M, float *N, float *P,int size){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= size) {
        return;
    }
    P[gid] = M[gid] + N[gid];
}
// Prints "<what> failed: <CUDA error string>" to std::cout when status is not
// cudaSuccess. Returns true on success, false on failure.
static bool addVectorGpuCallOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cout << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two host-resident float vectors on the GPU using zero-copy access:
// the caller's buffers are page-locked and mapped with cudaHostRegister, the
// kernel reads/writes them through their device pointers, and the buffers are
// unregistered again before returning.
//
//   M, N : input vectors of `size` floats (host memory)
//   P    : output vector of `size` floats (host memory); may alias M or N
//   size : element count; 0 is a successful no-op, negative values fail
//
// Precondition: cudaSetDeviceFlags(cudaDeviceMapHost) has been called once by
// the application before the device was initialized (it must not be called
// here, because it fails on an already-active device).
//
// Returns true when every CUDA call and the kernel succeeded; otherwise prints
// the failing step to std::cout and returns false. Buffers registered by this
// call are always unregistered, also on failure.
bool addVectorGPU(float* M, float* N, float* P, int size)
{
    if (size < 0) {
        std::cout << "addVectorGPU: invalid size " << size << std::endl;
        return false;
    }
    if (size == 0) {
        // An empty grid is an invalid launch configuration; nothing to add anyway.
        return true;
    }

    // size_t keeps the byte count from overflowing int for large vectors.
    const size_t bytes = static_cast<size_t>(size) * sizeof(float);

    float* hostBuf[3] = { M, N, P };
    float* devBuf[3] = { NULL, NULL, NULL };
    bool pinnedHere[3] = { false, false, false };
    bool ok = true;

    // Page-lock and map each caller buffer.
    for (int i = 0; ok && i < 3; ++i) {
        const cudaError_t status =
            cudaHostRegister(hostBuf[i], bytes, cudaHostRegisterMapped);
        if (status == cudaErrorHostMemoryAlreadyRegistered) {
            // Happens for in-place calls (e.g. P == M) or caller-pinned
            // buffers: reuse the existing registration and leave its lifetime
            // to whoever created it. Clear the recorded error so it is not
            // mistaken for a launch error below.
            cudaGetLastError();
        } else {
            ok = addVectorGpuCallOk(status, "cudaHostRegister");
            pinnedHere[i] = ok;
        }
    }

    // Translate each mapped host pointer into the pointer the kernel must use.
    for (int i = 0; ok && i < 3; ++i) {
        ok = addVectorGpuCallOk(
            cudaHostGetDevicePointer((void**)&devBuf[i], hostBuf[i], 0),
            "cudaHostGetDevicePointer");
    }

    if (ok) {
        // Integer ceil-division: exact for every positive int, unlike the
        // float-based ceil which loses precision above 2^24 elements.
        const unsigned int blocks =
            (static_cast<unsigned int>(size) + TILE_SIZE - 1u) / TILE_SIZE;
        dim3 dimBlock(TILE_SIZE);
        dim3 dimGrid(blocks);

        addVectorKernel<<<dimGrid, dimBlock>>>(devBuf[0], devBuf[1], devBuf[2], size);

        // Launch-configuration errors show up immediately; execution errors
        // only at the next synchronizing call. The sync also guarantees the
        // results are visible in P before the buffers are unregistered.
        cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess) {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess) {
            std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
            ok = false;
        }
    }

    // Undo only the registrations made by this call, on success and failure.
    for (int i = 0; i < 3; ++i) {
        if (pinnedHere[i] &&
            !addVectorGpuCallOk(cudaHostUnregister(hostBuf[i]), "cudaHostUnregister")) {
            ok = false;
        }
    }

    return ok;
}
// Benchmark driver: times ITERS runs of the CPU vector addition and ITERS runs
// of the zero-copy GPU vector addition on SIZE-element vectors, then checks
// the GPU result against the CPU reference.
// Returns 0 when everything succeeded and the results agree, 1 otherwise.
int main(){
    // Timing data. clock() reports processor time, not wall-clock time.
    float tcpuadd, tgpuadd;
    clock_t start, end;
    bool success = true;

    // Allocate the four vectors of SIZE floats
    float* M = new float[SIZE];
    float* N = new float[SIZE];
    float* Pcpu = new float[SIZE];
    float* Pgpu = new float[SIZE];

    // Initialize M and N to random floats in [0, 1]
    for (int i = 0; i < SIZE; i ++){
        M[i] = rand()/(float)(RAND_MAX);
        N[i] = rand()/(float)(RAND_MAX);
    }
    printf("Operating on a vector of length %d\n", SIZE);

    // Add two vectors and compute timing in CPU
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        addVectorCPU(M, N, Pcpu, SIZE);
    }
    end = clock();
    tcpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
    printf( "CPU Addition took %f ms\n", tcpuadd);

    // Enable mapped pinned memory. This must happen exactly once, before the
    // device is initialized; calling it later fails with
    // cudaErrorSetOnActiveProcess.
    const cudaError_t flagStatus = cudaSetDeviceFlags(cudaDeviceMapHost);
    if (flagStatus != cudaSuccess) {
        printf("cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(flagStatus));
        success = false;
    }

    // Untimed first call: absorbs one-time device initialization cost.
    if (success) {
        success = addVectorGPU(M, N ,Pgpu , SIZE);
    }

    if (success) {
        // compute GPU timing; stop at the first failing iteration
        start = clock();
        for (int i = 0; success && i < ITERS; i++) {
            success = addVectorGPU(M, N, Pgpu, SIZE);
        }
        end = clock();
    }

    if (success) {
        tgpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
        printf("GPU Addition took %f ms\n", tgpuadd);

        // Verify the GPU result against the CPU reference with a small
        // absolute tolerance (inputs are in [0, 1], so sums are in [0, 2]).
        const float tolerance = 1e-5f;
        for (int i = 0; i < SIZE; i++) {
            float diff = Pcpu[i] - Pgpu[i];
            if (diff < 0.0f) {
                diff = -diff;
            }
            if (diff > tolerance) {
                printf("Result mismatch at index %d: CPU %f, GPU %f\n",
                       i, Pcpu[i], Pgpu[i]);
                success = false;
                break;
            }
        }
    } else {
        printf("Device Error!\n");
    }

    // Release the host vectors on every exit path.
    delete[] M;
    delete[] N;
    delete[] Pcpu;
    delete[] Pgpu;

    return success ? 0 : 1;
}
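#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#define TILE_SIZE 512
#define SIZE 1048576
#define ITERS 10
bool addVectorCPU(float *M, float *N, float *P, int size){
for (int i=0; i … std::cout …(原文代码在此处被截断)
如果您对每个运行时API调用的返回值都进行检查,从而更完善地进行CUDA错误检查,就会发现该错误是在第二次调用此函数时由下面这个调用返回的: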
cudaSetDeviceFlags(cudaDeviceMapHost);
请注意,对于此运行时API调用:
如果当前设备已设置且该设备已初始化,则此调用将失败,并出现错误cudaErrorSetOnActiveProcess
解决方案是只在应用程序启动时调用一次 cudaSetDeviceFlags,而不是每次调用 addVectorGPU 函数时都调用它。请将该调用从 addVectorGPU 函数中移出,放到 main 例程中、第一次调用 addVectorGPU 函数之前。
此外,这段代码还存在其他一些问题,如下所述:
我建议对所有内核启动和所有 CUDA API 调用都进行适当的CUDA错误检查,而不是只在例程末尾检查一次。
cudaHostAlloc 的用法不正确。该程序的目的似乎是把指向主机驻留数据的主机指针传给GPU例程,然后用零拷贝(zero-copy)技术对这些数据做加法。这在技术上是可行的(尽管速度会非常慢),但正确的做法是使用 cudaHostRegister,而不是 cudaHostAlloc。cudaHostAlloc 会创建新的分配,因此传入函数的现有数据既不会被使用,也不会被引用。
以下是一个基于您所展示内容的有效示例。请注意,我个人不会以这种方式对事物进行基准测试,但我提供此示例是为了表明流程可以以无错误的方式工作:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#define TILE_SIZE 512
#define SIZE 1048576
#define ITERS 10
// Host reference implementation: stores M[k] + N[k] into P[k] for every k in
// [0, size), in increasing index order. Always returns true.
bool addVectorCPU(float *M, float *N, float *P, int size){
    int k = 0;
    while (k < size) {
        const float sum = M[k] + N[k];
        P[k] = sum;
        ++k;
    }
    return true;
}
// Element-wise vector addition kernel: each thread handles at most one element.
// The element index is derived from the x components of blockIdx, blockDim and
// threadIdx only; threads whose index is >= size return without touching memory.
__global__ void addVectorKernel(float *M, float *N, float *P,int size){
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= size) {
        return;
    }
    P[gid] = M[gid] + N[gid];
}
// Prints "<what> failed: <CUDA error string>" to std::cout when status is not
// cudaSuccess. Returns true on success, false on failure.
static bool addVectorGpuCallOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess) {
        return true;
    }
    std::cout << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Adds two host-resident float vectors on the GPU using zero-copy access:
// the caller's buffers are page-locked and mapped with cudaHostRegister, the
// kernel reads/writes them through their device pointers, and the buffers are
// unregistered again before returning.
//
//   M, N : input vectors of `size` floats (host memory)
//   P    : output vector of `size` floats (host memory); may alias M or N
//   size : element count; 0 is a successful no-op, negative values fail
//
// Precondition: cudaSetDeviceFlags(cudaDeviceMapHost) has been called once by
// the application before the device was initialized (it must not be called
// here, because it fails on an already-active device).
//
// Returns true when every CUDA call and the kernel succeeded; otherwise prints
// the failing step to std::cout and returns false. Buffers registered by this
// call are always unregistered, also on failure.
bool addVectorGPU(float* M, float* N, float* P, int size)
{
    if (size < 0) {
        std::cout << "addVectorGPU: invalid size " << size << std::endl;
        return false;
    }
    if (size == 0) {
        // An empty grid is an invalid launch configuration; nothing to add anyway.
        return true;
    }

    // size_t keeps the byte count from overflowing int for large vectors.
    const size_t bytes = static_cast<size_t>(size) * sizeof(float);

    float* hostBuf[3] = { M, N, P };
    float* devBuf[3] = { NULL, NULL, NULL };
    bool pinnedHere[3] = { false, false, false };
    bool ok = true;

    // Page-lock and map each caller buffer.
    for (int i = 0; ok && i < 3; ++i) {
        const cudaError_t status =
            cudaHostRegister(hostBuf[i], bytes, cudaHostRegisterMapped);
        if (status == cudaErrorHostMemoryAlreadyRegistered) {
            // Happens for in-place calls (e.g. P == M) or caller-pinned
            // buffers: reuse the existing registration and leave its lifetime
            // to whoever created it. Clear the recorded error so it is not
            // mistaken for a launch error below.
            cudaGetLastError();
        } else {
            ok = addVectorGpuCallOk(status, "cudaHostRegister");
            pinnedHere[i] = ok;
        }
    }

    // Translate each mapped host pointer into the pointer the kernel must use.
    for (int i = 0; ok && i < 3; ++i) {
        ok = addVectorGpuCallOk(
            cudaHostGetDevicePointer((void**)&devBuf[i], hostBuf[i], 0),
            "cudaHostGetDevicePointer");
    }

    if (ok) {
        // Integer ceil-division: exact for every positive int, unlike the
        // float-based ceil which loses precision above 2^24 elements.
        const unsigned int blocks =
            (static_cast<unsigned int>(size) + TILE_SIZE - 1u) / TILE_SIZE;
        dim3 dimBlock(TILE_SIZE);
        dim3 dimGrid(blocks);

        addVectorKernel<<<dimGrid, dimBlock>>>(devBuf[0], devBuf[1], devBuf[2], size);

        // Launch-configuration errors show up immediately; execution errors
        // only at the next synchronizing call. The sync also guarantees the
        // results are visible in P before the buffers are unregistered.
        cudaError_t status = cudaGetLastError();
        if (status == cudaSuccess) {
            status = cudaDeviceSynchronize();
        }
        if (status != cudaSuccess) {
            std::cout << "Kernel failed: " << cudaGetErrorString(status) << std::endl;
            ok = false;
        }
    }

    // Undo only the registrations made by this call, on success and failure.
    for (int i = 0; i < 3; ++i) {
        if (pinnedHere[i] &&
            !addVectorGpuCallOk(cudaHostUnregister(hostBuf[i]), "cudaHostUnregister")) {
            ok = false;
        }
    }

    return ok;
}
// Benchmark driver: times ITERS runs of the CPU vector addition and ITERS runs
// of the zero-copy GPU vector addition on SIZE-element vectors, then checks
// the GPU result against the CPU reference.
// Returns 0 when everything succeeded and the results agree, 1 otherwise.
int main(){
    // Timing data. clock() reports processor time, not wall-clock time.
    float tcpuadd, tgpuadd;
    clock_t start, end;
    bool success = true;

    // Allocate the four vectors of SIZE floats
    float* M = new float[SIZE];
    float* N = new float[SIZE];
    float* Pcpu = new float[SIZE];
    float* Pgpu = new float[SIZE];

    // Initialize M and N to random floats in [0, 1]
    for (int i = 0; i < SIZE; i ++){
        M[i] = rand()/(float)(RAND_MAX);
        N[i] = rand()/(float)(RAND_MAX);
    }
    printf("Operating on a vector of length %d\n", SIZE);

    // Add two vectors and compute timing in CPU
    start = clock();
    for (int i = 0; i < ITERS; i++) {
        addVectorCPU(M, N, Pcpu, SIZE);
    }
    end = clock();
    tcpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
    printf( "CPU Addition took %f ms\n", tcpuadd);

    // Enable mapped pinned memory. This must happen exactly once, before the
    // device is initialized; calling it later fails with
    // cudaErrorSetOnActiveProcess.
    const cudaError_t flagStatus = cudaSetDeviceFlags(cudaDeviceMapHost);
    if (flagStatus != cudaSuccess) {
        printf("cudaSetDeviceFlags failed: %s\n", cudaGetErrorString(flagStatus));
        success = false;
    }

    // Untimed first call: absorbs one-time device initialization cost.
    if (success) {
        success = addVectorGPU(M, N ,Pgpu , SIZE);
    }

    if (success) {
        // compute GPU timing; stop at the first failing iteration
        start = clock();
        for (int i = 0; success && i < ITERS; i++) {
            success = addVectorGPU(M, N, Pgpu, SIZE);
        }
        end = clock();
    }

    if (success) {
        tgpuadd = (float)(end - start) * 1000 / (float)CLOCKS_PER_SEC / ITERS;
        printf("GPU Addition took %f ms\n", tgpuadd);

        // Verify the GPU result against the CPU reference with a small
        // absolute tolerance (inputs are in [0, 1], so sums are in [0, 2]).
        const float tolerance = 1e-5f;
        for (int i = 0; i < SIZE; i++) {
            float diff = Pcpu[i] - Pgpu[i];
            if (diff < 0.0f) {
                diff = -diff;
            }
            if (diff > tolerance) {
                printf("Result mismatch at index %d: CPU %f, GPU %f\n",
                       i, Pcpu[i], Pgpu[i]);
                success = false;
                break;
            }
        }
    } else {
        printf("Device Error!\n");
    }

    // Release the host vectors on every exit path.
    delete[] M;
    delete[] N;
    delete[] Pcpu;
    delete[] Pgpu;

    return success ? 0 : 1;
}
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
#define TILE_SIZE 512
#define SIZE 1048576
#define ITERS 10
bool addVectorCPU(浮点*M、浮点*N、浮点*P、整数大小){
对于(inti=0;i