Cuda 将设备上分配的类数据复制回主机

Cuda 将设备上分配的类数据复制回主机,cuda,Cuda,在我的代码中,我希望在内核执行期间为类的指针数据成员分配内存,并在之后写入。然后我想稍后在主机上获取这些数据。然而,在我的方法中,我没有在主机上获得正确的数据,请参见下文。我的方法是完全错误的还是你能发现错误的部分 #include <cuda_runtime.h> #include <stdio.h> class OutputData { public: int *data; }; __global__ void init(OutputData *buffer)

在我的代码中,我希望在内核执行期间为类的指针数据成员分配内存并写入数据,之后再在主机上取回这些数据。然而,按照下面的做法,我在主机上得不到正确的数据。是我的方法完全错误,还是您能指出其中出错的部分?

#include <cuda_runtime.h>
#include <stdio.h>

// Plain record holding a single raw pointer to an int array.
// The type itself neither allocates nor releases that array; in this listing
// the pointer is filled in by the init kernel.
struct OutputData {
  int *data;  // address of the int payload
};

// Kernel: allocates a two-int array with in-kernel malloc, stores its address
// in buffer->data and fills it with {1, 2}.
//
// Launch layout: written for a single-thread launch (main uses <<<1,1>>>);
// every thread that executes it performs its own malloc and overwrites
// buffer->data.
// Shared memory: none.
// Precondition: buffer points to an OutputData that device code can write.
// On allocation failure buffer->data is set to NULL, a message is printed from
// the device, and nothing is written through the pointer.
__global__ void init(OutputData *buffer)
{
  const size_t count = 2;

  // allocate memory for data; in-kernel malloc may return NULL
  int *values = (int*) malloc(sizeof(int) * count);

  // publish the pointer even on failure so the caller sees NULL, not garbage
  buffer->data = values;
  if (values == NULL) {
    printf("init: in-kernel malloc of %u bytes failed\n",
           (unsigned) (sizeof(int) * count));
    return;
  }

  // write data
  values[0] = 1;
  values[1] = 2;
}

// Host driver for the experiment in question: runs init on a device-side
// OutputData, then tries to bring the record and its two-int payload back to
// the host with cudaMemcpy and prints what arrived.
//
// NOTE(review): the payload pointer stored in the record comes from the
// in-kernel malloc in init. The discussion that accompanies this listing
// reports that the cudaMemcpy and cudaFree calls on that pointer fail; the
// code is deliberately kept behaviourally unchanged so it still reproduces
// that result.
int main(int argc, char **argv)
{
  const size_t recordBytes  = sizeof(OutputData);
  const size_t payloadBytes = sizeof(int) * 2;

  // device-side record that the kernel fills in
  OutputData *d_buffer;
  cudaMalloc(&d_buffer, recordBytes);

  // one block, one thread; wait for it to finish
  init<<<1,1>>>(d_buffer);
  cudaDeviceSynchronize();

  // host copy of the record (this brings over the pointer value only)
  OutputData *h_buffer = static_cast<OutputData*>(malloc(recordBytes));
  cudaMemcpy(h_buffer, d_buffer, recordBytes, cudaMemcpyDeviceToHost);

  // host destination for the payload the record points at
  int *h_data = static_cast<int*>(malloc(payloadBytes));
  cudaMemcpy(h_data, h_buffer->data, payloadBytes, cudaMemcpyDeviceToHost);

  // show what reached the host
  printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);

  // release everything; h_buffer must still be alive while its member is read
  cudaFree(h_buffer->data);
  cudaFree(d_buffer);
  free(h_buffer);
  free(h_data);

  return 0;
}

然而,程序实际打印出的是错误的值,而不是预期的:

h_data[0] = 1, h_data[1] = 2

根据 CUDA 编程指南中关于设备端动态内存分配的说明:

此外,设备malloc内存不能用于任何运行时或驱动程序API调用,如cudaMemcpy、cudaMemset等

为了确认这一点,让我们使用 cuda-memcheck 运行您的代码(输出见下文):

这就是代码失败的原因:主机端 API 无法访问 h_buffer->data 中保存的地址。还要注意,这块内存也不能从主机上释放。

您可以采用类似下面的做法:用托管内存(managed memory)分配来充当主机端内存,使其可以在内核中直接访问,再配合设备端的 cudaMemcpyAsync 调用完成复制:


还有其他变体,如从主机在全局内存中构建堆结构的完整镜像结构,然后运行复制内核,但是,这些都没有这样做的意义。

主机无法访问在设备运行时堆上用 new 或 malloc 分配的内存。每当您的 CUDA 代码遇到问题时,进行适当的 CUDA 错误检查并使用 cuda-memcheck 运行代码通常是一种很好的做法。我通常建议人们在请求他人帮助之前先做这些事情。即使您不理解错误输出,它通常也会对那些试图帮助您的人有用。
h_data[0] = 1, h_data[1] = 2
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.cu 
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 36791296, h_data[1] = 0
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemcpy. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
=========     Host Frame:./heapcopy [0x3cb0a]
=========     Host Frame:./heapcopy [0x31ac]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:./heapcopy [0x2fd9]
=========
========= Program hit cudaErrorInvalidDevicePointer (error 17) due to "invalid device pointer" on CUDA API call to cudaFree. 
=========     Saved host backtrace up to driver entry point at error
=========     Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x3451c3]
=========     Host Frame:./heapcopy [0x44f00]
=========     Host Frame:./heapcopy [0x31dc]
=========     Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
=========     Host Frame:./heapcopy [0x2fd9]
=========
========= ERROR SUMMARY: 2 errors
#include <cuda_runtime.h>
#include <stdio.h>

// Plain record with one member: a raw pointer to an int array.
// No constructor or destructor is declared, so the record does not manage the
// array; in this listing the init kernel and main assign the pointer.
struct OutputData {
  int *data;  // address of the int payload
};

// Kernel: allocates a two-int array with in-kernel malloc, stores its address
// in buffer->data and fills it with {1, 2}.
//
// Launch layout: written for a single-thread launch (main uses <<<1,1>>>);
// every thread that executes it performs its own malloc and overwrites
// buffer->data.
// Shared memory: none.
// Precondition: buffer points to an OutputData that device code can write.
// On allocation failure buffer->data is set to NULL, a message is printed from
// the device, and nothing is written through the pointer.
__global__ void init(OutputData *buffer)
{
  const size_t count = 2;

  // allocate memory for data; in-kernel malloc may return NULL
  int *values = (int*) malloc(sizeof(int) * count);

  // publish the pointer even on failure so the caller sees NULL, not garbage
  buffer->data = values;
  if (values == NULL) {
    printf("init: in-kernel malloc of %u bytes failed\n",
           (unsigned) (sizeof(int) * count));
    return;
  }

  // write data
  values[0] = 1;
  values[1] = 2;
}

// Kernel: copies datasz bytes from source->data to dest->data by issuing a
// device-side cudaMemcpyAsync (device-to-device).
//
// Launch layout: written for a single-thread launch (main uses <<<1,1>>>);
// every thread that executes it issues its own copy of the same bytes.
// Shared memory: none.
// Build note: the copy is a device-side runtime API call; the build commands
// shown with this listing compile with -dc.
// Preconditions: dest, source and both ->data pointers must be dereferenceable
// from device code, and both payloads must hold at least datasz bytes.
// NULL pointers and a failing copy request are reported with a device printf
// instead of being dereferenced or silently ignored.
__global__ void deepcopy(OutputData* dest, OutputData* source, size_t datasz)
{
    if (dest == NULL || source == NULL) {
        printf("deepcopy: NULL OutputData pointer\n");
        return;
    }
    if (dest->data == NULL || source->data == NULL) {
        // e.g. the in-kernel malloc that should have produced source->data failed
        printf("deepcopy: NULL data pointer\n");
        return;
    }

    cudaError_t status =
        cudaMemcpyAsync(dest->data, source->data, datasz, cudaMemcpyDeviceToDevice);
    if (status != cudaSuccess) {
        printf("deepcopy: device-side cudaMemcpyAsync failed (error %d)\n",
               (int) status);
    }
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
  if (status == cudaSuccess)
    return true;
  fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
  return false;
}

// Kernel: releases the array stored in buffer->data with in-kernel free and
// clears the pointer. Needed because that array was allocated by in-kernel
// malloc, which the host-side cudaFree cannot release.
// Launch layout: single thread (<<<1,1>>>). Shared memory: none.
__global__ void releaseData(OutputData *buffer)
{
  free(buffer->data);
  buffer->data = NULL;
}

// Host driver: lets init build a two-int payload on the device heap, copies it
// into managed memory with the deepcopy kernel, prints it, and releases every
// allocation. Every runtime call and kernel launch is checked; on the first
// failure a message is written to stderr and main returns 1 (remaining
// allocations are then left for process teardown).
int main(int argc, char **argv)
{
  // device-side record that init fills in
  OutputData *d_buffer;
  if (!cudaOk(cudaMalloc(&d_buffer, sizeof(OutputData)), "cudaMalloc(d_buffer)"))
    return 1;

  // run kernel
  init<<<1,1>>>(d_buffer);
  if (!cudaOk(cudaGetLastError(), "init launch"))
    return 1;
  if (!cudaOk(cudaDeviceSynchronize(), "init execution"))
    return 1;

  // The record itself lives in cudaMalloc memory, so it can be copied to the
  // host; only the payload it points to is out of reach of host API calls.
  // Use the copied pointer value to confirm the in-kernel malloc succeeded.
  OutputData d_record;
  if (!cudaOk(cudaMemcpy(&d_record, d_buffer, sizeof(OutputData),
                         cudaMemcpyDeviceToHost), "cudaMemcpy(d_buffer)"))
    return 1;
  if (d_record.data == NULL) {
    fprintf(stderr, "init could not allocate the payload on the device heap\n");
    return 1;
  }

  // managed memory stands in for host memory so a kernel can write to it
  size_t dsize = sizeof(int)*2;
  OutputData* h_buffer;
  if (!cudaOk(cudaMallocManaged(&h_buffer, sizeof(OutputData)),
              "cudaMallocManaged(h_buffer)"))
    return 1;
  int* h_data;
  if (!cudaOk(cudaMallocManaged(&h_data, dsize), "cudaMallocManaged(h_data)"))
    return 1;

  // known starting values, so a copy that never happened is visible in the output
  h_data[0] = 0;
  h_data[1] = 0;
  h_buffer->data = h_data;

  // run kernel
  deepcopy<<<1,1>>>(h_buffer, d_buffer, dsize);
  if (!cudaOk(cudaGetLastError(), "deepcopy launch"))
    return 1;
  if (!cudaOk(cudaDeviceSynchronize(), "deepcopy execution"))
    return 1;

  // Print the data
  printf("h_data[0] = %d, h_data[1] = %d\n", h_data[0], h_data[1]);

  // free the device-heap payload from device code
  releaseData<<<1,1>>>(d_buffer);
  if (!cudaOk(cudaGetLastError(), "releaseData launch"))
    return 1;
  if (!cudaOk(cudaDeviceSynchronize(), "releaseData execution"))
    return 1;

  // free memory
  bool freed = cudaOk(cudaFree(h_data), "cudaFree(h_data)");
  freed = cudaOk(cudaFree(h_buffer), "cudaFree(h_buffer)") && freed;
  freed = cudaOk(cudaFree(d_buffer), "cudaFree(d_buffer)") && freed;

  return freed ? 0 : 1;
}
$ nvcc -std=c++11 -arch=sm_52 -dc -o heapcopy.o heapcopy.cu 
$ nvcc -std=c++11 -arch=sm_52 -o heapcopy heapcopy.o 
$ cuda-memcheck ./heapcopy
========= CUDA-MEMCHECK
h_data[0] = 1, h_data[1] = 2
========= ERROR SUMMARY: 0 errors