Concurrency cuda理解并发内核执行
我试图理解并发内核执行是如何工作的。我写了一个简单的程序来尝试理解它。内核将使用2个流填充2D数组。当有一个流,没有并发时,我得到了正确的结果。当我尝试使用2个流,尝试并发时,我得到了错误的结果。我相信这可能与内存传输有关,因为我不确定我是否正确,或者我设置内核的方式是否正确。编程指南对我解释得不够好。 出于我的目的,我需要Matlab调用内核 据我所知,主要项目将:Concurrency cuda理解并发内核执行,concurrency,cuda,Concurrency,Cuda,我试图理解并发内核执行是如何工作的。我写了一个简单的程序来尝试理解它。内核将使用2个流填充2D数组。当有一个流,没有并发时,我得到了正确的结果。当我尝试使用2个流,尝试并发时,我得到了错误的结果。我相信这可能与内存传输有关,因为我不确定我是否正确,或者我设置内核的方式是否正确。编程指南对我解释得不够好。 出于我的目的,我需要Matlab调用内核 据我所知,主要项目将: 在主机上分配固定内存 在GPU上分配单个流所需的内存(2个流=主机总内存的一半) 创建流 环游溪流 使用cudaMemcpyA
- 在主机上分配固定内存
- 在GPU上分配单个流所需的内存(2个流=主机总内存的一半)
- 创建流
- 环游溪流
- 使用cudaMemcpyAsync()将单个流的内存从主机复制到设备
- 为流执行内核
- 将流的内存复制回主机cudaMemcpyAsync()
- 我相信我所做的是正确的,使用基于每个流的数据大小和流编号的偏移量,从每个流所需的位置引用内存
- 摧毁溪流
- 释放内存
__global__ void concurrentKernel(int const width,
int const streamIdx,
double *array)
{
int thread = (blockIdx.x * blockDim.x) + threadIdx.x;;
for (int i = 0; i < width; i ++)
{
array[thread*width+i] = thread+i*width+1;
// array[thread*width+i+streamIdx] = thread+i*width+streamIdx*width/2;
}
}
\uuuu全局\uuuuu无效concurrentKernel(int const width,
int const streamIdx,
双*阵列)
{
int thread=(blockIdx.x*blockDim.x)+threadIdx.x;;
对于(int i=0;i
concurrentMexFunction.cu
#include <stdio.h>
#include <math.h>
#include "mex.h"
/* Kernel function */
#include "concurrentKernel.cpp"
void mexFunction(int nlhs,
mxArray *plhs[],
int nrhs,
mxArray *prhs[])
{
int const numberOfStreams = 2; // set number of streams to use here.
cudaError_t cudaError;
int offset;
int width, height, fullSize, streamSize;
width = 512;
height = 512;
fullSize = height*width;
streamSize = (int)(fullSize/numberOfStreams);
mexPrintf("fullSize: %d, streamSize: %d\n",fullSize, streamSize);
/* Return the populated array */
double *returnedArray;
plhs[0] = mxCreateDoubleMatrix(height, width, mxREAL);
returnedArray = mxGetPr(plhs[0]);
cudaStream_t stream[numberOfStreams];
for (int i = 0; i < numberOfStreams; i++)
{
cudaStreamCreate(&stream[i]);
}
/* host memory */
double *hostArray;
cudaError = cudaMallocHost(&hostArray,sizeof(double)*fullSize); // full size of array.
if (cudaError != cudaSuccess) {mexPrintf("hostArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
for (int i = 0; i < height; i++)
{
for (int j = 0; j < width; j++)
{
hostArray[i*width+j] = -1.0;
}
}
/* device memory */
double *deviceArray;
cudaError = cudaMalloc( (void **)&deviceArray,sizeof(double)*streamSize); // size of array for each stream.
if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
for (int i = 0; i < numberOfStreams; i++)
{
offset = i;//*streamSize;
mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);
cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
if (cudaError != cudaSuccess) {mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
}
for (int i = 0; i < numberOfStreams; i++)
{
cudaStreamDestroy(stream[i]);
}
cudaFree(hostArray);
cudaFree(deviceArray);
}
#包括
#包括
#包括“mex.h”
/*核函数*/
#包括“concurrentKernel.cpp”
无效MEX函数(整数nlhs,
mxArray*plhs[],
国际nrhs,
mxArray*prhs[])
{
int const numberOfStreams=2;//设置要在此处使用的流的数量。
cudaError\u t cudaError;
整数偏移量;
整型宽度、高度、全尺寸、流线型尺寸;
宽度=512;
高度=512;
全尺寸=高度*宽度;
streamSize=(int)(完整大小/数量流);
mexPrintf(“完整大小:%d,流大小:%d\n”,完整大小,流大小);
/*返回填充的数组*/
双*返回阵列;
plhs[0]=mxCreateDoubleMatrix(高度、宽度、mxREAL);
returnedArray=mxGetPr(plhs[0]);
cudaStream_t stream[numberOfStreams];
for(int i=0;i
当有两个流时,结果是一个零数组,这让我觉得内存有问题。
谁能解释我做错了什么?
如果有人需要从Matlab编译和运行这些代码,我可以提供相应的命令
更新:
for (int i = 0; i < numberOfStreams; i++)
{
offset = i*streamSize;
mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);
}
cudaDeviceSynchronize();
for (int i = 0; i < numberOfStreams; i++)
{
offset = i*streamSize;
mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
if (cudaError != cudaSuccess) {mexPrintf("returnedArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
cudaStreamDestroy(stream[i]);
}
for(int i=0;ifor (int i = 0; i < numberOfStreams; i++)
{
offset = i;//*streamSize;
mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize,
cudaMemcpyHostToDevice, stream[i]);
concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);
cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize,
cudaMemcpyDeviceToHost, stream[i]);
}
// Host thread waits here until both kernels and copies are finished
cudaDeviceSynchronize();
for (int i = 0; i < numberOfStreams; i++)
{
mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
cudaStreamDestroy(stream[i]);
}
/* device memory */
double *deviceArray[numberOfStreams];
for (int i = 0; i < numberOfStreams; i++)
{
cudaError = cudaMalloc( (void **)&deviceArray[i],sizeof(double)*streamSize); // size of array for each stream.
if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
}
for (int i = 0; i < numberOfStreams; i++)
{
offset = i;//*streamSize;
mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);
cudaMemcpyAsync(deviceArray[i], hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray[i]);
cudaMemcpyAsync(returnedArray+offset, deviceArray[i], sizeof(double)*streamSize,
cudaMemcpyDeviceToHost, stream[i]);
}