CUDA:将设备上分配的类数据复制回主机
在我的代码中,我希望在内核执行期间为类的指针数据成员分配内存,并随后向其中写入数据,之后再在主机端取回这些数据。然而,按照下面的做法,我在主机上得到的数据并不正确。是我的方法完全错了,还是你能指出其中出错的部分?
#include <cuda_runtime.h>
#include <stdio.h>
// Minimal result holder used by both the kernel and the host code in this
// listing: it carries one raw pointer to int storage and nothing else.
struct OutputData {
    int *data;  // not initialized here; whoever allocates the storage sets it
};
// Allocates a two-int array with the in-kernel malloc(), stores its address in
// buffer->data and fills it with the values 1 and 2.
//
// buffer: pointer to an OutputData object the kernel can write to.
//
// Launch layout: written for a single thread (main launches it as <<<1,1>>>);
// every additional thread would make its own allocation and overwrite
// buffer->data.
//
// Memory obtained from the in-kernel malloc() cannot be handed to host-side
// runtime API calls such as cudaMemcpy or cudaFree.
//
// On allocation failure buffer->data is left as NULL and nothing is written,
// so the failure is observable instead of the kernel faulting on a null store.
__global__ void init(OutputData *buffer)
{
    // The in-kernel malloc() returns NULL when the request cannot be satisfied.
    int *values = (int*) malloc(sizeof(int) * 2);
    buffer->data = values;
    if (values == NULL) {
        return;
    }

    values[0] = 1;
    values[1] = 2;
}
// Host driver of the failing reproduction: lets the kernel allocate and fill
// OutputData::data, then tries to pull that array back with host-side runtime
// calls. The two calls marked NOTE(review) are the ones cuda-memcheck reports
// for this listing; they are kept as-is because they are what is being
// demonstrated.
int main(int argc, char **argv)
{
    const size_t dataBytes = sizeof(int)*2;

    // Device-side OutputData object that the kernel fills in.
    OutputData *d_buffer;
    cudaMalloc(&d_buffer, sizeof(OutputData));

    // A single thread performs the allocation and the two writes.
    init<<<1,1>>>(d_buffer);
    cudaDeviceSynchronize();

    // Host copy of the struct; afterwards h_buffer->data holds the address
    // that the kernel obtained from its in-kernel malloc().
    OutputData *h_buffer = static_cast<OutputData*>(malloc(sizeof *h_buffer));
    cudaMemcpy(h_buffer, d_buffer, sizeof(OutputData), cudaMemcpyDeviceToHost);

    // NOTE(review): h_buffer->data comes from the in-kernel malloc(); such
    // memory cannot be used with cudaMemcpy, so this call fails with
    // "invalid argument" and h_data stays uninitialized.
    int* h_data = static_cast<int*>(malloc(dataBytes));
    cudaMemcpy(h_data, h_buffer->data, dataBytes, cudaMemcpyDeviceToHost);

    printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);

    // NOTE(review): the same pointer cannot be released from the host either;
    // this cudaFree is rejected with "invalid device pointer".
    cudaFree(h_buffer->data);
    cudaFree(d_buffer);
    free(h_data);
    free(h_buffer);
    return 0;
}
程序的输出并不是预期的
h_data[0] = 1, h_data[1] = 2
根据 CUDA 编程指南中关于在内核内使用 malloc() 动态分配内存的说明:
此外,设备端 malloc() 分配的内存不能用于任何运行时或驱动程序 API 调用(例如 cudaMemcpy、cudaMemset 等)。
为了确认这一点,让我们使用 cuda-memcheck 运行您的代码:
这就是代码失败的原因:主机端 API 无法访问 h_buffer->data 中保存的地址。还要注意,这块内存也不能从主机端释放。
您可以改用下面的做法:把主机端的缓冲区改为托管内存(managed memory)分配,使内核可以直接访问它,然后在内核中执行设备端的 cudaMemcpyAsync 调用来完成复制:
还有其他变通做法,例如由主机在全局内存中为堆上的结构建立一份完整的镜像结构,然后运行一个复制内核;但无论采用哪种做法,都绕不开这一事实:主机无法访问在设备运行时堆上用 new 或 malloc 分配的内存。每当 CUDA 代码出现问题时,做好错误检查并使用 cuda-memcheck 运行代码通常是一种很好的做法。我通常建议人们在请求他人帮助之前先做这些事情。即使您不理解错误输出,它通常也会对那些试图帮助您的人有用。
h_data[0] = 1, h_data[1] = 2
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.cu
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 36791296, h_data[1] = 0
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x3cb0a]
========= Host Frame:./heapcopy [0x31ac]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= Program hit cudaErrorInvalidDevicePointer (error 17) due to "invalid device pointer" on CUDA API call to cudaFree.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
========= Host Frame:./heapcopy [0x44f00]
========= Host Frame:./heapcopy [0x31dc]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:./heapcopy [0x2fd9]
=========
========= ERROR SUMMARY: 2 errors
#include <cuda_runtime.h>
#include <stdio.h>
// Minimal result holder used by both the kernels and the host code in this
// listing: it carries one raw pointer to int storage and nothing else.
struct OutputData {
    int *data;  // not initialized here; whoever allocates the storage sets it
};
// Allocates a two-int array with the in-kernel malloc(), stores its address in
// buffer->data and fills it with the values 1 and 2.
//
// buffer: pointer to an OutputData object the kernel can write to.
//
// Launch layout: written for a single thread (main launches it as <<<1,1>>>);
// every additional thread would make its own allocation and overwrite
// buffer->data.
//
// Memory obtained from the in-kernel malloc() cannot be handed to host-side
// runtime API calls such as cudaMemcpy or cudaFree.
//
// On allocation failure buffer->data is left as NULL and nothing is written,
// so the failure is observable instead of the kernel faulting on a null store.
__global__ void init(OutputData *buffer)
{
    // The in-kernel malloc() returns NULL when the request cannot be satisfied.
    int *values = (int*) malloc(sizeof(int) * 2);
    buffer->data = values;
    if (values == NULL) {
        return;
    }

    values[0] = 1;
    values[1] = 2;
}
// Copies datasz bytes from source->data to dest->data by calling
// cudaMemcpyAsync from inside the kernel, so no host-side API call has to
// touch the source pointer.
//
// dest:   OutputData whose data pointer refers to the destination storage.
// source: OutputData whose data pointer refers to the storage to copy from.
// datasz: number of bytes to copy.
//
// Launch layout: intended for one thread (main launches it as <<<1,1>>>);
// every additional thread would issue the same copy again.
//
// Build note: the build commands shown with this listing compile with -dc and
// link in a separate step.
//
// Nothing is copied when any of the pointers is NULL; that case and a failing
// cudaMemcpyAsync are reported with a device-side printf instead of being
// silently ignored.
__global__ void deepcopy(OutputData* dest, OutputData* source, size_t datasz)
{
    if (dest == NULL || source == NULL ||
        dest->data == NULL || source->data == NULL) {
        printf("deepcopy: null pointer, nothing copied\n");
        return;
    }

    cudaError_t status = cudaMemcpyAsync(dest->data, source->data, datasz,
                                         cudaMemcpyDeviceToDevice);
    if (status != cudaSuccess) {
        printf("deepcopy: cudaMemcpyAsync failed (error %d)\n", (int) status);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise; `what` names the
// call or launch being checked and is only used in the message.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Releases, from device code, the array stored in buffer->data and resets the
// pointer to NULL. The array was allocated inside a kernel, so it has to be
// freed inside a kernel as well. Intended for a single-thread launch.
static __global__ void releaseData(OutputData *buffer)
{
    free(buffer->data);
    buffer->data = NULL;
}

// Host driver: lets init() allocate and fill the array on the device, copies
// it into managed memory with the deepcopy kernel, prints the two values and
// releases everything.
//
// Every runtime call and kernel launch is checked; the first failure is
// reported on stderr, the remaining work is skipped, and the process exits
// with status 1 instead of 0.
int main(int argc, char **argv)
{
    const size_t dsize = sizeof(int)*2;
    OutputData *d_buffer = NULL;
    OutputData *h_buffer = NULL;
    int *h_data = NULL;
    bool initRan = false;

    // Device-side OutputData object that init() fills in.
    bool ok = cudaOk(cudaMalloc(&d_buffer, sizeof(OutputData)),
                     "cudaMalloc(d_buffer)");

    if (ok) {
        init<<<1,1>>>(d_buffer);
        // A launch reports configuration errors through cudaGetLastError();
        // faults during execution surface at the synchronize.
        ok = cudaOk(cudaGetLastError(), "init launch")
          && cudaOk(cudaDeviceSynchronize(), "init execution");
        initRan = ok;
    }

    // Managed allocations: written by the deepcopy kernel, read by the host.
    ok = ok && cudaOk(cudaMallocManaged(&h_buffer, sizeof(OutputData)),
                      "cudaMallocManaged(h_buffer)");
    ok = ok && cudaOk(cudaMallocManaged(&h_data, dsize),
                      "cudaMallocManaged(h_data)");

    if (ok) {
        h_buffer->data = h_data;
        deepcopy<<<1,1>>>(h_buffer, d_buffer, dsize);
        // Synchronize before the host reads h_data below.
        ok = cudaOk(cudaGetLastError(), "deepcopy launch")
          && cudaOk(cudaDeviceSynchronize(), "deepcopy execution");
    }

    if (ok) {
        printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);
    }

    // The array allocated inside init() cannot be passed to cudaFree, so it
    // is released by a kernel before d_buffer itself goes away.
    if (initRan) {
        releaseData<<<1,1>>>(d_buffer);
        bool released = cudaOk(cudaGetLastError(), "releaseData launch")
                     && cudaOk(cudaDeviceSynchronize(), "releaseData execution");
        ok = ok && released;
    }

    // Each cudaFree runs regardless of earlier failures; NULL is accepted.
    ok = cudaOk(cudaFree(h_data), "cudaFree(h_data)") && ok;
    ok = cudaOk(cudaFree(h_buffer), "cudaFree(h_buffer)") && ok;
    ok = cudaOk(cudaFree(d_buffer), "cudaFree(d_buffer)") && ok;

    return ok ? 0 : 1;
}
$ nvcc -std=c++11 -arch=sm_52 -dc -o heapcopy.o heapcopy.cu
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.o
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 1, h_data[1] = 2
========= ERROR SUMMARY: 0 errors