CUDA内核的启动参数取决于先前的内核？_Cuda

CUDA内核的启动参数取决于先前的内核？

cuda

CUDA内核'；s的启动参数取决于先前的内核？,cuda,Cuda,我的代码中经常有这种附加模式。基本上，它相当于一个用于过滤大型数据集的第一个内核，其中返回的选定条目将非常稀疏，然后是第二个内核，用于对大大减少的数据集执行更复杂的计算看起来cudaStreamSynchronize几乎是多余的，但我看不出有什么办法可以绕过它是否有一种替代模式可以避免内核之间的同步 CUDA动态并行性有什么帮助吗示例代码： /* Pseudocode. Won't Compile */ /* Please ignore silly mistakes/syntax an

我的代码中经常出现下面这种模式。基本上，它由两个内核组成：第一个内核用于过滤大型数据集，返回的选定条目非常稀疏；第二个内核则对大大缩减后的数据集执行更复杂的计算。

看起来 cudaStreamSynchronize 几乎是多余的，但我看不出有什么办法可以绕过它。

是否有一种替代模式可以避免内核之间的同步？
CUDA 动态并行（Dynamic Parallelism）对此有帮助吗？

示例代码：

/* Pseudocode. Won't Compile */
/* Please ignore silly mistakes/syntax and inefficient/incorrect simplifications */

/*
 * Filtering kernel (pseudocode): every thread whose condition holds appends
 * its result to dataOut at a slot handed out by a shared device counter.
 *
 * dataIn      - input data; presumably read by the elided computation (not shown).
 * dataOut     - output buffer; written only by threads that pass the condition.
 * counter_ptr - device counter, atomically incremented once per stored result.
 *               If it was zero before the launch it ends up holding the number
 *               of entries written to dataOut.
 *
 * NOTE(review): no bound on dataOut's capacity is checked here - confirm the
 * caller sizes the buffer for the worst case.
 */
__global__ void bar( const float * dataIn, float * dataOut, unsigned int * counter_ptr ) 
{
   < do some computation > 
   if (bConditionalComputedAboveIsTrue)
   { 
      /* Reserve a unique output slot: atomicInc returns the previous counter
         value; with the limit (unsigned int)(-1) the counter only wraps once
         it reaches the largest unsigned value. */
      const unsigned int ind = atomicInc(counter_ptr, (unsigned int)(-1));
      dataOut[ ind ] = resultOfAboveComputation;
   } 
}

/*
 * Host-side driver for the filter-then-process pattern.
 *
 * Launches bar on `stream` to compact selected results from d_datain into
 * d_tempbuffer, copies the resulting element count back to the host, and then
 * launches baz with a grid sized from that count.
 *
 * d_datain     - device input passed to bar.
 * d_tempbuffer - device scratch buffer: written by bar, read by baz.
 * d_output     - device output passed to baz.
 * stream       - stream on which all work is enqueued.
 *
 * Returns 0 (cudaSuccess) on success, otherwise the first cudaError_t that
 * was reported, converted to int. The temporary counter is always released.
 */
int foo( float * d_datain, float* d_tempbuffer, float* d_output, cudaStream_t stream  ){
   /* Initialize a counter that will be updated by the bar kernel */
   unsigned int * counter_ptr = NULL;
   cudaError_t err = cudaMalloc( &counter_ptr, sizeof( unsigned int) ); //< Create a Counter
   if (err != cudaSuccess)
   {
      return (int)err; //< Nothing was allocated, so there is nothing to clean up
   }

   err = cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream); //< Initially Set the Counter to 0
   if (err == cudaSuccess)
   {
      dim3 threadsInit(16,16,1);
      dim3 gridInit(256, 1, 1);
      /* Launch the Filtering Kernel. This will update the value in counter_ptr */
      bar<<< gridInit, threadsInit, 0, stream >>>( d_datain, d_tempbuffer, counter_ptr );
      err = cudaGetLastError(); //< A launch reports configuration errors only through this call
   }

   /* Download the count and synchronize the stream */
   unsigned int count = 0; //< Defined value even if the copy below never runs
   if (err == cudaSuccess)
   {
      err = cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
   }
   if (err == cudaSuccess)
   {
      /* The host needs `count` to size the next grid, so it has to wait here. */
      err = cudaStreamSynchronize( stream ); //< Is there any way around this synchronize?
   }

   if (err == cudaSuccess)
   {
      /* Compute the grid parameters and launch a second kernel */
      dim3 bazThreads(128,1,1);
      /* count/128 + 1 is never zero, so the launch stays valid when nothing was selected */
      dim3 bazGrid( count/128 + 1, 1, 1); //< Here I use the counter modified in the prior kernel to set the grid parameters
      baz<<< bazGrid, bazThreads, 0, stream >>>( d_tempbuffer, d_output );
      err = cudaGetLastError();
   }

   /* cleanup: always free the counter, but report the earliest failure */
   const cudaError_t freeErr = cudaFree(counter_ptr);
   if (err == cudaSuccess)
   {
      err = freeErr;
   }
   return (int)err;
}

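/* 伪代码。无法编译 */
/* 请忽略低级错误/语法问题以及低效/不正确的简化 */
__global__ void bar( const float * dataIn, float * dataOut, unsigned int * counter_ptr )
{
   < 做一些计算 >
   if (bConditionalComputedAboveIsTrue)
   {
      const unsigned int ind = atomicInc(counter_ptr, (unsigned int)(-1));
      dataOut[ ind ] = resultOfAboveComputation;
   }
}
int foo( float * d_datain, float* d_tempbuffer, float* d_output, cudaStream_t stream ){
   /* 初始化将由 bar 内核更新的计数器 */
   unsigned int * counter_ptr;
   cudaMalloc( &counter_ptr, sizeof( unsigned int) ); //< 创建一个计数器
   cudaMemsetAsync(counter_ptr, 0, sizeof(unsigned int), stream); //< 将计数器初始值设为 0
   dim3 threadsInit(16,16,1);
   dim3 gridInit(256, 1, 1);
   /* 启动过滤内核。它会更新 counter_ptr 中的值 */
   bar<<< gridInit, threadsInit, 0, stream >>>( d_datain, d_tempbuffer, counter_ptr );
   /* 下载计数并同步流 */
   unsigned int count;
   cudaMemcpyAsync(&count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost, stream);
   cudaStreamSynchronize( stream ); //< 有什么方法可以避免这次同步吗？
   /* 计算网格参数并启动第二个内核 */
   dim3 bazThreads(128,1,1);
   dim3 bazGrid( count/128 + 1, 1, 1); //< 这里我使用先前内核中修改的计数器来设置网格参数
   baz<<< bazGrid, bazThreads, 0, stream >>>( d_tempbuffer, d_output );
   /* 清理 */
   cudaFree(counter_ptr);
}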

与其改变第二个内核中的线程块数量，不如使用固定的块数，让各个块根据实际的工作量自行调整。

例如，启动更多的线程块，如果没有剩余工作就让它们提前退出；或者启动刚好足够填满设备的线程块，并让每个块循环处理工作。网格跨步循环（grid-stride loop）是实现这一点的好方法。