Concurrency: understanding concurrent kernel execution in CUDA

I am trying to understand how concurrent kernel execution works, so I wrote a simple program to experiment with it. The kernel populates a 2D array using 2 streams. With one stream and no concurrency I get the correct results; with 2 streams, attempting concurrency, I get the wrong results. I believe it is probably something to do with the memory transfers, since I am not sure I have those right, or with the way I have set up the kernel; the programming guide does not explain it well enough for me. For my purposes, I need Matlab to be the caller of the kernel.

As I understand it, the main program will:

  • Allocate pinned memory on the host
  • Allocate the memory needed for a single stream on the GPU (2 streams = half the host's total memory)
  • Create the streams
  • Loop over the streams
  • Copy the memory for a single stream from host to device using cudaMemcpyAsync()
  • Execute the kernel for the stream
  • Copy the memory for the stream back to the host with cudaMemcpyAsync()
    • I believe I am doing this correctly, referencing the memory from the location needed by each stream using offsets based on each stream's data size and the stream number (see the sketch after this list)
  • Destroy the streams
  • Free the memory
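A minimal sketch of the offset arithmetic I am aiming for (it reuses the variable names from the full code below). Offsets are in elements rather than bytes, because pointer arithmetic on a double* already scales by sizeof(double); the byte count only appears in the size argument of the copy:

/* Sketch only: chunking one pinned host buffer across numberOfStreams streams. */
int streamSize = fullSize / numberOfStreams;       /* elements per stream */
for (int i = 0; i < numberOfStreams; i++)
{
    int offset = i * streamSize;                   /* element offset of this chunk */
    cudaMemcpyAsync(deviceArray, hostArray + offset,
                    sizeof(double) * streamSize,   /* byte count of one chunk */
                    cudaMemcpyHostToDevice, stream[i]);
}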
Below is the code I am trying to use:

concurrentKernel.cpp

__global__ void concurrentKernel(int const width,
                                  int const streamIdx,
                                  double *array)
 {
     /* each thread fills one row of 'width' elements */
     int thread = (blockIdx.x * blockDim.x) + threadIdx.x;

     for (int i = 0; i < width; i ++)
     {
        array[thread*width+i] = thread+i*width+1;
//         array[thread*width+i+streamIdx] = thread+i*width+streamIdx*width/2;
     }

 }

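With 512 threads each filling a row of 512 elements, this kernel writes the full 512x512 array; once each stream only owns half of the array, the indexing has to change. A guess at what a per-stream variant might look like (concurrentKernelChunk and rowsPerStream are hypothetical names of mine, not from the original code):

__global__ void concurrentKernelChunk(int const width,
                                      int const rowsPerStream,
                                      int const streamIdx,
                                      double *array)
{
    int thread = (blockIdx.x * blockDim.x) + threadIdx.x;
    if (thread >= rowsPerStream) return;   /* guard against extra threads */

    for (int i = 0; i < width; i++)
    {
        /* local index into this stream's buffer; global row only affects the value */
        array[thread*width+i] = (streamIdx*rowsPerStream + thread) + i*width + 1;
    }
}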
concurrentMexFunction.cu

#include <stdio.h>
#include <math.h>
#include "mex.h"

/* Kernel function */
#include "concurrentKernel.cpp"


void mexFunction(int        nlhs,
                 mxArray    *plhs[],
                 int        nrhs,
                 mxArray    *prhs[])
{

    int const numberOfStreams = 2; // set number of streams to use here.
    cudaError_t cudaError;
    int offset;

    int width, height, fullSize, streamSize;
    width = 512;
    height = 512;
    fullSize = height*width;
    streamSize = (int)(fullSize/numberOfStreams);
    mexPrintf("fullSize: %d, streamSize: %d\n",fullSize, streamSize);

    /* Return the populated array */
    double *returnedArray;
    plhs[0] = mxCreateDoubleMatrix(height, width, mxREAL);
    returnedArray = mxGetPr(plhs[0]);

    cudaStream_t stream[numberOfStreams];
    for (int i = 0; i < numberOfStreams; i++)
    {
        cudaStreamCreate(&stream[i]);    
    }

    /* host memory */
    double *hostArray;
    cudaError = cudaMallocHost(&hostArray,sizeof(double)*fullSize);    // full size of array.
    if (cudaError != cudaSuccess) {mexPrintf("hostArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            hostArray[i*width+j] = -1.0;
        }
    }

    /* device memory */
    double *deviceArray;
    cudaError = cudaMalloc( (void **)&deviceArray,sizeof(double)*streamSize);    // size of array for each stream.
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }


    for (int i = 0; i < numberOfStreams; i++)
    {
        offset = i;//*streamSize;
        mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

        cudaError = cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
        if (cudaError != cudaSuccess) {mexPrintf("hostArray to deviceArray copy failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

        concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);

        cudaError = cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
        if (cudaError != cudaSuccess) {mexPrintf("deviceArray to returnedArray copy failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

        mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);
    }


    for (int i = 0; i < numberOfStreams; i++)
    {
        cudaStreamDestroy(stream[i]);    
    }

    cudaFreeHost(hostArray);    /* pinned host memory is released with cudaFreeHost, not cudaFree */
    cudaFree(deviceArray);

}
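Since every CUDA runtime call returns a cudaError_t, the repeated check-and-print pattern above can be wrapped in a helper. A minimal sketch of my own (CHECK_CUDA is a hypothetical name, not part of the original code); the bare return is fine here because mexFunction returns void:

#define CHECK_CUDA(call)                                                    \
    do {                                                                    \
        cudaError_t err = (call);                                           \
        if (err != cudaSuccess) {                                           \
            mexPrintf("%s failed\n********** Error: %s **********\n",       \
                      #call, cudaGetErrorString(err));                      \
            return;                                                         \
        }                                                                   \
    } while (0)

/* Usage inside mexFunction:
   CHECK_CUDA(cudaMallocHost(&hostArray, sizeof(double)*fullSize));
   CHECK_CUDA(cudaMalloc((void **)&deviceArray, sizeof(double)*streamSize)); */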
With two streams the result is an array of zeros, which makes me think it is something to do with the memory. Can anyone explain what I am doing wrong? If anyone needs the commands to compile and run these from Matlab, I can provide them.
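One way to narrow down where it goes wrong (a diagnostic sketch of my own, not part of the original code) is to check for launch errors after each kernel and to force all queued work to finish before inspecting the results:

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);

    cudaError = cudaGetLastError();               /* launch-time errors */
    if (cudaError != cudaSuccess) {mexPrintf("kernel launch failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    cudaError = cudaDeviceSynchronize();          /* execution-time errors; also waits for all streams */
    if (cudaError != cudaSuccess) {mexPrintf("kernel execution failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }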

Update:

for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaError = cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("hostArray to deviceArray copy failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray);


}
cudaDeviceSynchronize();


for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaError = cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize, cudaMemcpyDeviceToHost, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray to returnedArray copy failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]);

    cudaStreamDestroy(stream[i]);    
}
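To check whether the copies and kernels in different streams actually overlap, one option (a sketch of my own, not from the original code) is to bracket the whole batch of streamed work with CUDA events and compare the elapsed time against the single-stream version:

cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start, 0);
/* ... issue all cudaMemcpyAsync calls and kernel launches here ... */
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);        /* wait until everything before 'stop' has finished */

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
mexPrintf("elapsed: %.3f ms\n", ms);

cudaEventDestroy(start);
cudaEventDestroy(stop);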
The main structural problem is that the whole copy-kernel-copy sequence for one stream is issued before the next stream is touched, which makes it hard for the hardware to overlap work from different streams. Instead, issue all of the asynchronous copies and kernel launches back to back, then synchronize once before reading any results on the host:

for (int i = 0; i < numberOfStreams; i++) 
{ 
    offset = i;//*streamSize; 
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset); 

    cudaMemcpyAsync(deviceArray, hostArray+offset, sizeof(double)*streamSize, 
                    cudaMemcpyHostToDevice, stream[i]); 

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray); 

    cudaMemcpyAsync(returnedArray+offset, deviceArray, sizeof(double)*streamSize,
                    cudaMemcpyDeviceToHost, stream[i]); 
} 

// Host thread waits here until both kernels and copies are finished
cudaDeviceSynchronize();

for (int i = 0; i < numberOfStreams; i++) 
{ 
    offset = i;//*streamSize; 
    mexPrintf("returnedArray[offset]: %g, [end]: %g\n",returnedArray[offset/sizeof(double)],returnedArray[(i+1)*streamSize-1]); 
    cudaStreamDestroy(stream[i]);     
} 
There is a second problem: both streams read and write the same deviceArray, so their copies and kernels race with one another. Give each stream its own device buffer instead:

/* device memory */
double *deviceArray[numberOfStreams];
for (int i = 0; i < numberOfStreams; i++)
{
    cudaError = cudaMalloc( (void **)&deviceArray[i],sizeof(double)*streamSize);    // size of array for each stream.
    if (cudaError != cudaSuccess) {mexPrintf("deviceArray memory allocation failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }
}

for (int i = 0; i < numberOfStreams; i++)
{
    offset = i;//*streamSize;
    mexPrintf("offset: %d, element: %d\n",offset*sizeof(double),offset);

    cudaError = cudaMemcpyAsync(deviceArray[i], hostArray+offset, sizeof(double)*streamSize, cudaMemcpyHostToDevice, stream[i]);
    if (cudaError != cudaSuccess) {mexPrintf("hostArray to deviceArray copy failed\n********** Error: %s **********\n",cudaGetErrorString(cudaError)); return; }

    concurrentKernel<<<1, 512, 0, stream[i]>>>(width, i, deviceArray[i]); 

    cudaMemcpyAsync(returnedArray+offset, deviceArray[i], sizeof(double)*streamSize,
                    cudaMemcpyDeviceToHost, stream[i]);     
}
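Putting the pieces together, here is a minimal sketch of the corrected loop (my own reconstruction, not code from the original post): per-stream device buffers, element offsets of i*streamSize, and a single synchronization point. Note that it launches streamSize/width threads rather than 512; that is an assumption on my part, so that each stream only fills its own half-size buffer instead of writing past the end of it:

for (int i = 0; i < numberOfStreams; i++)
{
    offset = i*streamSize;                       /* element offset of this stream's chunk */

    cudaMemcpyAsync(deviceArray[i], hostArray+offset, sizeof(double)*streamSize,
                    cudaMemcpyHostToDevice, stream[i]);

    /* streamSize/width threads, each filling one row of 'width' elements,
       exactly covers the streamSize elements of this stream's buffer */
    concurrentKernel<<<1, streamSize/width, 0, stream[i]>>>(width, i, deviceArray[i]);

    cudaMemcpyAsync(returnedArray+offset, deviceArray[i], sizeof(double)*streamSize,
                    cudaMemcpyDeviceToHost, stream[i]);
}

/* wait for all copies and kernels in all streams before reading the results */
cudaDeviceSynchronize();

One caveat: returnedArray comes from mxGetPr and is pageable memory, so the device-to-host cudaMemcpyAsync calls into it will not be fully asynchronous; to overlap those copies as well, the results would have to land in pinned memory first and be copied into the mxArray afterwards.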