Asynchronous texture object allocation in CUDA multi-GPU code
I have some code for texture object allocation and host-to-device copying. It is just a modification of an existing answer. I do not use streams explicitly, just cudaSetDevice().

This code works fine; however, when I run the Visual Profiler, I can see that the memory copies from host to array are not asynchronous. Each copy is assigned to its own device stream, but the second one does not start until the first one finishes (running on 2 GPUs). I have tried with large images, so I am sure it is not just CPU overhead.

My guess is that something in the code requires synchronization and therefore stalls the CPU, but I don't know what. What can I do to make this loop asynchronous?
MCVE:
#include <cuda_runtime.h>
#include <cstdlib>
#include <cstring>

void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage);

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;
    float* image = (float*)malloc(nVoxelX*nVoxelY*nVoxelZ*sizeof(float));

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
The actual function:
void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);

        //cudaArray Descriptor
        const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");

        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent = extent;
        copyParams.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
        //Array creation End

        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];

        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;

        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }
}
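The commented-out cudaCheckErrors(...) calls refer to an error-checking helper that is not shown here. A minimal sketch of such a macro, assuming it simply checks cudaGetLastError and aborts with a message (the exact definition is an assumption, modeled on a common CUDA error-checking idiom):

#include <cstdio>
#include <cstdlib>

// Hypothetical error-checking helper; the real definition is not shown in the question.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)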
The two main issues I can see in the code are:

1. The host allocation is a pageable allocation (plain malloc). For a copy operation where one endpoint is host memory to be asynchronous, the host memory must be a pinned allocation; the refactored code below uses cudaHostAlloc instead (note the Pinned source type in the nvprof output).

2. There are other synchronizing operations in the loop. cudaMalloc3DArray is synchronizing. I haven't run tests to determine whether cudaCreateTextureObject is synchronizing as well, but I wouldn't be surprised if it is. My general recommendation for asynchrony is therefore to move synchronizing operations out of the loop.

From the perspective of nvprof, restructuring the code this way seems to allow the operations to overlap:
$ cat t399.cu
#include <cstring>

void CreateTexture(int num_devices, float* imagedata, int nVoxelX, int nVoxelY, int nVoxelZ, cudaArray** d_cuArrTex, cudaTextureObject_t *texImage)
{
    //size_t size_image=nVoxelX*nVoxelY*nVoxelZ;
    const cudaExtent extent = make_cudaExtent(nVoxelX, nVoxelY, nVoxelZ);

    // Synchronizing allocations: do them all up front, before issuing any copies
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaMalloc3DArray(&d_cuArrTex[i], &channelDesc, extent);
        //cudaCheckErrors("Texture memory allocation fail");
    }

    // Asynchronous host-to-array copies: one per device, issued back-to-back
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        cudaMemcpy3DParms copyParams = {0};
        //Array creation
        copyParams.srcPtr = make_cudaPitchedPtr((void *)imagedata, extent.width*sizeof(float), extent.width, extent.height);
        copyParams.dstArray = d_cuArrTex[i];
        copyParams.extent = extent;
        copyParams.kind = cudaMemcpyHostToDevice;
        cudaMemcpy3DAsync(&copyParams);
        //cudaCheckErrors("Texture memory data copy fail");
    }

    // Texture object creation, possibly synchronizing, after all copies are in flight
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        //Array creation End
        cudaResourceDesc texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array = d_cuArrTex[i];

        cudaTextureDesc texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModePoint;
        texDescr.addressMode[0] = cudaAddressModeBorder;
        texDescr.addressMode[1] = cudaAddressModeBorder;
        texDescr.addressMode[2] = cudaAddressModeBorder;
        texDescr.readMode = cudaReadModeElementType;

        cudaCreateTextureObject(&texImage[i], &texRes, &texDescr, NULL);
        //cudaCheckErrors("Texture object creation fail");
    }

    // Wait for all devices to finish before returning
    for (unsigned int i = 0; i < num_devices; i++){
        cudaSetDevice(i);
        cudaDeviceSynchronize();
    }
}

int main(void)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);

    int nVoxelX = 512;
    int nVoxelY = 512;
    int nVoxelZ = 512;

    // Pinned host allocation instead of malloc, so the copies can be asynchronous
    float* image;
    cudaHostAlloc(&image, nVoxelX*nVoxelY*nVoxelZ*sizeof(float), cudaHostAllocDefault);

    cudaTextureObject_t *texImg = new cudaTextureObject_t[deviceCount];
    cudaArray **d_cuArrTex = new cudaArray*[deviceCount];

    CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);
}
$ nvcc -o t399 t399.cu
$ cuda-memcheck ./t399
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof --print-gpu-trace ./t399
==19953== NVPROF is profiling process 19953, command: ./t399
==19953== Profiling application: ./t399
==19953== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput SrcMemType DstMemType Device Context Stream Name
1.55311s 90.735ms - - - - - 512.00MB 5.5106GB/s Pinned Array Tesla P100-PCIE 1 7 [CUDA memcpy HtoA]
1.55316s 90.640ms - - - - - 512.00MB 5.5163GB/s Pinned Array Tesla K40m (1) 2 18 [CUDA memcpy HtoA]
1.55318s 85.962ms - - - - - 512.00MB 5.8165GB/s Pinned Array Tesla K20Xm (2) 3 29 [CUDA memcpy HtoA]
1.55320s 89.908ms - - - - - 512.00MB 5.5612GB/s Pinned Array Tesla K20Xm (3) 4 40 [CUDA memcpy HtoA]
Regs: Number of registers used per CUDA thread. This number includes registers used internally by the CUDA driver and/or tools and can be more than what the compiler shows.
SSMem: Static shared memory allocated per CUDA block.
DSMem: Dynamic shared memory allocated per CUDA block.
SrcMemType: The type of source memory accessed by memory operation/copy
DstMemType: The type of destination memory accessed by memory operation/copy
$
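As a side note: if the image buffer must remain a plain malloc allocation (for example, because it comes from code you do not control), an alternative to cudaHostAlloc is to pin the existing buffer after the fact with cudaHostRegister. A minimal sketch under that assumption, reusing the names from the code above:

// Pin an existing pageable allocation so the async copies can overlap.
float* image = (float*)malloc(nVoxelX*nVoxelY*nVoxelZ*sizeof(float));
cudaHostRegister(image, nVoxelX*nVoxelY*nVoxelZ*sizeof(float), cudaHostRegisterDefault);

CreateTexture(deviceCount, image, nVoxelX, nVoxelY, nVoxelZ, d_cuArrTex, texImg);

// Unpin before freeing the buffer.
cudaHostUnregister(image);
free(image);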