如何管理CUDA内核执行链中的if/else条件,以确保在GPU上不间断地执行?

如何管理CUDA内核执行链中的if/else条件,以确保在GPU上不间断地执行?(标签: cuda)

我有许多图像块作为CUDA内核系列的输入。在此执行链中,一个步骤的输出被用作后续步骤的输入,而无需将中间输出复制回主机内存中

// Kernel chain for one tile: each kernel's device output feeds the next
// kernel on the same stream, so no intermediate host copies are needed.
cudaKernel1(inputImage, out1, stream);
cudaKernel2(out1, out2, stream);
cudaKernel3(out2, out3, stream);
....
cudaKernelN(..., ..., stream);
// NOTE(review): this call writes `output1` but the next kernel reads `out1` —
// looks like a transcription inconsistency; confirm the intended buffer name.
cudaKernel1(inputImage, output1, stream);
cudaKernel2(out1, out2, stream);
cudaKernel3(out2, out3, stream);
....
cudaKernel11(out10, out11,stream);

// Asynchronously copy the decision value from device to host.
copyDtoHAsync(temp,out11, stream);

// Host blocks here until `temp` is valid — this is the stalling point
// the question is about.
cuStreamSynchronize(stream);

// Conditional tail of the chain, decided on the host from `temp`.
if(SOME_CONDITION_ON_temp)
{ 
    cudaKernel12(out11, out12, stream);
    cudaKernel13(out12, out13, stream);
    cudaKernel14(out13, out14, stream);
    .........
}
但对于特定场景,我必须在执行链中包含
if/else
条件,为此我必须将结果复制回主机内存

// Per-tile execution chain: each stage's device output feeds the next stage
// on the same stream, so no intermediate device-to-host copies are required.
cudaKernel1(inputImage, out1, stream);
cudaKernel2(out1, out2, stream);
cudaKernel3(out2, out3, stream);
....
cudaKernelN(..., ..., stream);
// fixed: was `output1`, but cudaKernel2 reads `out1` — the chain's dataflow
// was broken by the mismatched buffer name.
cudaKernel1(inputImage, out1, stream);
cudaKernel2(out1, out2, stream);
cudaKernel3(out2, out3, stream);
....
cudaKernel11(out10, out11, stream);

// Asynchronously copy the decision value from device to host; `temp` must be
// pinned host memory for the copy to be truly asynchronous.
copyDtoHAsync(temp, out11, stream);

// The host must wait here before it may inspect `temp` — this synchronization
// is what blocks the chain.
cuStreamSynchronize(stream);

// Host-side branch: the tail kernels are enqueued only when the condition
// computed from `temp` holds.
if (SOME_CONDITION_ON_temp)
{
    cudaKernel12(out11, out12, stream);
    cudaKernel13(out12, out13, stream);
    cudaKernel14(out13, out14, stream);
    .........
}
在上述场景中,copyDtoHAsync、cuStreamSynchronize 以及随后的 if 判断都是会阻塞流水线的调用。

假设我有100个输入图块,同时在多个GPU流上执行。如果条件对其中40个图块为真、对其余60个图块为假,那么管理这类中间阻塞调用的最佳方法是什么?如何确保这40个图块在GPU上不间断地执行,而不会被那些阻塞调用拖住?

任何相关帖子、类似的问题或相关的示例都将不胜感激。

您可以尝试为每个图块使用一个独立的CUDA流,并结合OpenMP在主机端并行提交。大致如下:

// One stream per tile so independent tiles can overlap on the GPU. Each OpenMP
// thread drives its own tile's host-side synchronization point, so a blocked
// tile stalls only its own host thread, not the other tiles.
cudaStream_t streams[num_tiles];
#pragma omp parallel for
for (int i = 0; i < num_tiles; i++)
{
    // NOTE(review): creating/destroying a stream every iteration is costly;
    // prefer creating the streams once before the loop and reusing them.
    cudaStreamCreate(&streams[i]);

    // fixed: the array is named `streams`, but every use below said
    // `stream[i]` (an undeclared identifier).
    cudaKernel1(inputImage, out1, streams[i]);
    cudaKernel2(out1, out2, streams[i]);
    cudaKernel3(out2, out3, streams[i]);
    ...
    cudaKernel11(out10, out11, streams[i]);

    // fixed: a single shared `temp` written by all OpenMP threads is a data
    // race — each tile needs its own host-side result slot. `temp[i]` should
    // be pinned host memory for the copy to be truly asynchronous.
    // NOTE(review): out1..out14 likewise must be per-tile device buffers,
    // otherwise concurrent tiles overwrite each other — confirm with caller.
    copyDtoHAsync(temp[i], out11, streams[i]);

    // Blocks only this host thread; the other tiles keep running on their
    // own streams.
    cuStreamSynchronize(streams[i]);

    // Evaluate the condition on this tile's own result, temp[i].
    if (SOME_CONDITION_ON_temp)
    {
        cudaKernel12(out11, out12, streams[i]);
        cudaKernel13(out12, out13, streams[i]);
        cudaKernel14(out13, out14, streams[i]);
        .........
    }

    cudaStreamDestroy(streams[i]);
}
(评论)我会把算法的 if 部分移到 GPU 上执行。@RobertCrovella 是的,但无论 if 的结果为真还是假,都需要向 if 代码块中的所有后续内核传递一个标志。此外,在这种设计中,我还必须把全部 100 个图块都送入 if 块的内核,这是我不想要的。