cudaStream 的奇怪性能表现
我尝试用 cudaStream 开发一个 Sobel 示例。程序如下:
// Sobel demo using NB_STREAM CUDA streams. Work is queued in three phases so
// transfers and kernels from different streams can overlap:
//   phase 1: per-tile H2D copies into the texture-bound CUDA array,
//   phase 2: per-tile SobelKernel launches,
//   phase 3: per-tile D2H copies of the result rows.
// The whole sequence is timed with CUDA events and the elapsed time printed.
// Uses file-scope globals: Stream, Image (texture ref), Array_PatchsMaxDevice.
void SobelStream(void)
{
    cv::Mat imageGrayL2 = cv::imread("/home/xavier/Bureau/Image1.png",0);

    u_int8_t *u8_PtImageHost;
    u_int8_t *u8_PtImageDevice;
    u_int8_t *u8_ptDataOutHost;
    u_int8_t *u8_ptDataOutDevice;
    u_int8_t u8_Used[NB_STREAM];   // tile bookkeeping; written but never read here

    // FIX: cudaMemcpyAsync only runs asynchronously (and overlaps with kernel
    // execution) when the host side of the transfer is page-locked. The original
    // pageable malloc() buffers forced staged, serialized copies.
    checkCudaErrors(cudaHostAlloc((void**)&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice, WIDTH*HEIGHT*sizeof(u_int8_t)));
    checkCudaErrors(cudaHostAlloc((void**)&u8_PtImageHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
    checkCudaErrors(cudaMalloc((void**)&u8_PtImageDevice, WIDTH*HEIGHT*sizeof(u_int8_t)));

    // FIX: the H2D source below is the OpenCV image buffer, which is pageable.
    // Pin it in place so those copies are truly asynchronous as well.
    // (Assumes the image is WIDTH x HEIGHT, 8-bit, with no row padding — TODO confirm.)
    checkCudaErrors(cudaHostRegister(imageGrayL2.data, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostRegisterDefault));

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
    checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc, WIDTH, HEIGHT));
    checkCudaErrors(cudaBindTextureToArray(Image, Array_PatchsMaxDevice));

    // Clear the whole output surface before the tiled passes start.
    dim3 threads(BLOC_X,BLOC_Y);
    dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));
    ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT);
    checkCudaErrors(cudaGetLastError());

    int blockh = HEIGHT/NB_STREAM;   // nominal tile height per stream

    Stream = (cudaStream_t *) malloc(NB_STREAM * sizeof(cudaStream_t));
    for (int i = 0; i < NB_STREAM; i++)
    {
        checkCudaErrors(cudaStreamCreate(&(Stream[i])));
    }

    cudaEvent_t Start;
    cudaEvent_t Stop;
    checkCudaErrors(cudaEventCreate(&Start));
    checkCudaErrors(cudaEventCreate(&Stop));
    checkCudaErrors(cudaEventRecord(Start, 0));

    ////////////////// Phase 1: queue H2D tile copies //////////////////
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight = blockh;
            checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
                                                      0,
                                                      0,
                                                      imageGrayL2.data,
                                                      WIDTH,
                                                      WIDTH,
                                                      blockh,
                                                      cudaMemcpyHostToDevice ,
                                                      Stream[i]));
            // NOTE(review): stream 0's kernel is launched again (with a 1x1
            // grid) in the phase-2 loop below; kept here to preserve the
            // original behavior, but it looks like leftover experimentation —
            // confirm whether this launch is intentional.
            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)localHeight/BLOC_Y));
            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
            checkCudaErrors(cudaGetLastError());
            u8_Used[i] = 1;
        }else{
            int ioffsetImage = WIDTH*(HEIGHT/NB_STREAM );
            int hoffset = HEIGHT/NB_STREAM *i;
            // Last tile may be shorter when HEIGHT is not a multiple of NB_STREAM.
            int localHeight = min(HEIGHT - (blockh*i),blockh);
            checkCudaErrors(cudaMemcpy2DToArrayAsync( Array_PatchsMaxDevice,
                                                      0,
                                                      hoffset,
                                                      &imageGrayL2.data[ioffsetImage*i],
                                                      WIDTH,
                                                      WIDTH,
                                                      localHeight,
                                                      cudaMemcpyHostToDevice ,
                                                      Stream[i]));
            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;  // image fully covered; remaining streams unused
            }
        }
    }

    ////////////////// Phase 2: queue per-tile Sobel kernels //////////////////
    // Single-block launches: with only 1 block per kernel the tiles can run
    // concurrently on different streams (presumably SobelKernel internally
    // iterates to cover the whole tile from one block — TODO confirm).
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight = blockh;
            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(1,1);
            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,0,WIDTH,localHeight-1);
            checkCudaErrors(cudaGetLastError());
            u8_Used[i] = 1;
        }else{
            // First output row of this tile (tiles share a one-row border).
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight = min(HEIGHT - (blockh*i),blockh);
            dim3 threads(BLOC_X,BLOC_Y);
            dim3 blocks(1,1);
            SobelKernel<<<blocks,threads,0,Stream[i]>>>(u8_ptDataOutDevice,hoffsetkernel,WIDTH,localHeight);
            checkCudaErrors(cudaGetLastError());
            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }

    ////////////////// Phase 3: queue D2H result copies //////////////////
    for(int i=0;i<NB_STREAM;i++)
    {
        if(i == 0)
        {
            int localHeight = blockh;
            checkCudaErrors(cudaMemcpyAsync(u8_ptDataOutHost,u8_ptDataOutDevice,WIDTH*(localHeight-1)*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
            u8_Used[i] = 1;
        }else{
            int hoffsetkernel = HEIGHT/NB_STREAM -1 + HEIGHT/NB_STREAM* (i-1);
            int localHeight = min(HEIGHT - (blockh*i),blockh);
            checkCudaErrors(cudaMemcpyAsync(&u8_ptDataOutHost[hoffsetkernel*WIDTH],&u8_ptDataOutDevice[hoffsetkernel*WIDTH],WIDTH*localHeight*sizeof(u_int8_t),cudaMemcpyDeviceToHost,Stream[i]));
            u8_Used[i] = 1;
            if(HEIGHT - (blockh +1 +blockh*(i-1))<=0)
            {
                break;
            }
        }
    }

    // Wait for all queued work to finish, then stop the timer.
    for(int i=0;i<NB_STREAM;i++)
    {
        checkCudaErrors(cudaStreamSynchronize(Stream[i]));
    }
    checkCudaErrors(cudaEventRecord(Stop, 0));
    checkCudaErrors(cudaEventSynchronize(Stop));

    float dt_ms;
    checkCudaErrors(cudaEventElapsedTime(&dt_ms, Start, Stop));
    printf("dt_ms %f \n",dt_ms);

    // FIX: release everything allocated here — the original leaked the streams,
    // events, host buffers, device buffers, texture binding and the CUDA array.
    // NOTE(review): Stream, Image and Array_PatchsMaxDevice are file-scope
    // globals; this assumes no other code uses them after this call — confirm.
    for (int i = 0; i < NB_STREAM; i++)
    {
        checkCudaErrors(cudaStreamDestroy(Stream[i]));
    }
    free(Stream);
    Stream = NULL;
    checkCudaErrors(cudaEventDestroy(Start));
    checkCudaErrors(cudaEventDestroy(Stop));
    checkCudaErrors(cudaUnbindTexture(Image));
    checkCudaErrors(cudaFreeArray(Array_PatchsMaxDevice));
    checkCudaErrors(cudaHostUnregister(imageGrayL2.data));
    checkCudaErrors(cudaFree(u8_PtImageDevice));
    checkCudaErrors(cudaFree(u8_ptDataOutDevice));
    checkCudaErrors(cudaFreeHost(u8_PtImageHost));
    checkCudaErrors(cudaFreeHost(u8_ptDataOutHost));
}
void Sobestream(void)
{
cv::Mat imageGrayL2 = cv::imread("/home/xavier/Bureau/Image1.png",0);
u_int8_t *u8_PtImageHost;
u_int8_t *u8_PtImageDevice;
u_int8_t *u8_ptDataOutHost;
u_int8_t *u8_ptDataOutDevice;
u_int8_t u8_Used[NB_STREAM];
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
checkCudaErrors(cudaMalloc((void**)&u8_ptDataOutDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
checkCudaErrors(cudaMalloc((void**)&u8_PtImageDevice,WIDTH*HEIGHT*sizeof(u_int8_t)));
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned char>();
checkCudaErrors(cudaMallocArray(&Array_PatchsMaxDevice, &channelDesc,WIDTH,HEIGHT ));
checkCudaErrors(cudaBindTextureToArray(Image,Array_PatchsMaxDevice));
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));
ClearKernel<<<blocks,threads>>>(u8_ptDataOutDevice,WIDTH,HEIGHT);
int blockh = HEIGHT/NB_STREAM;
Stream = (cudaStream_t *) malloc(NB_STREAM * sizeof(cudaStream_t));
for (int i = 0; i < NB_STREAM; i++)
{
    checkCudaErrors(cudaStreamCreate(&(Stream[i])));
}

首先,在将来,请提供完整的代码。我还将处理您的交叉发布,以填写一些详细信息,例如内核大小。
您有两个问题需要解决:
首先,任何时候如果你想使用cudaMemcpyAsync
,你都很可能希望使用固定主机分配。如果你使用的是通过malloc
创建的分配,就异步并发执行而言,你将无法从cudaMemcpyAsync
中获得预期的行为。这种必要性在文档中有说明:
如果复制涉及主机内存,则必须对其进行页锁定
因此,对代码所做的第一个更改是转换以下内容:
u8_PtImageHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
u8_ptDataOutHost = (u_int8_t *)malloc(WIDTH*HEIGHT*sizeof(u_int8_t));
为此:
checkCudaErrors(cudaHostAlloc(&u8_PtImageHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
checkCudaErrors(cudaHostAlloc(&u8_ptDataOutHost, WIDTH*HEIGHT*sizeof(u_int8_t), cudaHostAllocDefault));
根据我的测试,仅此更改一项,您的执行持续时间就从约21ms下降到7ms。原因是,如果没有更改,我们不会有任何重叠:
更改后,复制活动可以相互重叠(H->D和D->H)并与内核执行重叠:
要实现并发内核执行,您面临的第二个问题是内核太大(块/线程太多):
我建议,如果这些是您需要运行的内核的大小,那么尝试和争取内核重叠可能没有多大好处——每个内核都启动了足够的块来“填充”GPU,因此您已经公开了足够的并行性以保持GPU繁忙。但是,如果您迫切希望看到内核并发,您可以让您的内核使用更少的块,同时使每个内核花费更多的时间执行。我们可以通过只启动 1 个块来实现这一点,并让每个块中的线程遍历执行整个图像滤波。问题在哪里?什么是出乎意料的?
// Problem sizes used by the reproducer discussed above.
#define WIDTH 6400
#define HEIGHT 4800
#define NB_STREAM 10
#define BLOC_X 32
#define BLOC_Y 32
// Full-image launch configuration: one BLOC_X x BLOC_Y thread block per tile,
// grid sized by ceiling division so partial edge tiles are still covered.
dim3 threads(BLOC_X,BLOC_Y);
dim3 blocks(ceil((float)WIDTH/BLOC_X),ceil((float)HEIGHT/BLOC_Y));