Cuda 前一个块上的活动等待，不带atomicAdd（x，0）_Cuda

Cuda 前一个块上的活动等待，不带atomicAdd（x，0）

cuda

Cuda 前一个块上的活动等待，不带atomicAdd（x，0）,cuda,Cuda,我有一个CUDA内核，其中一个块需要在执行新块之前完成之前的每个块。我使用两个原子计数器实现同步，如下所示： __global__ static void waitTest(int* counters) { __shared__ int orderedBlockId; int tid = threadIdx.x; if(tid == 0){ orderedBlockId = atomicAdd(&counters[0], 1 );

我有一个CUDA内核，其中一个块需要在执行新块之前完成之前的每个块。我使用两个原子计数器实现同步，如下所示：

// Kernel in which each block delays its work until every block belonging to
// an earlier group of 16 (grouped by arrival order, not by blockIdx) has
// finished.
//
// counters[0] is used as a ticket dispenser: thread 0 of every block takes the
//             next value with atomicAdd.
// counters[1] is used as a completion count: thread 0 of every block adds 1
//             to it after the block's work.
// The logic relies on both counters being 0 when the kernel starts.
__global__
static void waitTest(int* counters)
{
    // Ticket of this block; written by thread 0 only.
    __shared__ int orderedBlockId;
    int tid = threadIdx.x;

    // Only thread 0 takes the ticket and spins; the other threads of the
    // block are held at the __syncthreads() below until it is done.
    if(tid == 0){
        orderedBlockId = atomicAdd(&counters[0], 1 );
        //wait on previous group of 16 blocks
        // Integer division rounds the ticket down to a multiple of 16, i.e.
        // the number of blocks contained in all earlier groups.
        int expectedCounter = orderedBlockId / 16 * 16;
        // atomicAdd(..., 0) leaves the counter unchanged and returns its
        // current value; it is used here purely as the read of the
        // completion count on every iteration of the spin loop.
        while(atomicAdd(&counters[1],0) < expectedCounter){
            //wait
        }
    }
    __syncthreads();

    //do something

    // Every thread of the block must be past its work before the block is
    // reported as finished.
    __syncthreads();

    if(tid == 0){
        // Report this block as finished.
        atomicAdd( &counters[1], 1 );
    }
}

// Launches waitTest on 128 blocks of 128 threads with both counters zeroed.
// Returns 0 on success and 1 if the launch or the kernel execution failed.
int main(){
    // Two device-side ints, both initialised to 0, as the kernel requires.
    thrust::device_vector<int> counters(2,0);
    waitTest<<<128,128>>>(thrust::raw_pointer_cast(counters.data()));

    // A kernel launch does not return a status; launch-configuration errors
    // have to be queried explicitly.
    if(cudaGetLastError() != cudaSuccess){
        return 1;
    }

    // Block until the kernel has finished so that execution errors (for
    // example a launch timeout caused by a spin loop that never exits) are
    // observed here instead of being silently dropped.
    if(cudaDeviceSynchronize() != cudaSuccess){
        return 1;
    }

    return 0;
}

\u全局__
静态无效等待测试（int*计数器）
{
__shared_u u; int orderedBlockId；
int tid=threadIdx.x；
如果（tid==0）{
orderedBlockId=atomicAdd（&counters[0]，1）；
//等待前一组16个区块
int expectedCounter=orderedBlockId/16*16；
while（原子添加（&计数器[1]，0）


我的问题:
有没有更便宜的方法来强制全局内存读取而不是atomicAdd（&counters[1]，0）？
将其替换为计数器[1]
会使内核超时。
正如Robert在评论中提到的计数器必须声明为易失性。为了将计数器传递给原子添加，必须将计数器转换回其原始类型（无volatile
）。代码：
\u全局__
静态void waitTest（volatile int*计数器）
{
__shared_u u; int orderedBlockId；
int tid=threadIdx.x；
如果（tid==0）{
orderedBlockId=atomicAdd（（int*）和计数器[0]，1）；
//等待前一组16个区块
int expectedCounter=orderedBlockId/16*16；
while（计数器[1]
将计数器标记为volatile
@RobertCrovella然后我得到没有重载函数“atomicAdd”的实例与参数列表匹配
在atomicAdd
中使用时将其转换为非volatile类型读而不写的整数始终是原子的。所以您不会读取2字节的旧值和2字节的新值atomicAdd（&counters[1]，0）
可以安全地替换为volatile
读取，因为它总是写入与读取相同的值，因此可以删除写入。是的，但问题是什么？这两种操作都是原子操作。为每个块分配orderedBlockId
，而不是直接使用blockId.x
的想法是避免在较低块id之前使用sm的块id较高，这可能会导致不必要的暂停（最坏情况是死锁）。是吗？没错。不能保证具有低blockId.x
的块在具有高blockId.x的块之前执行。
// Kernel in which each block delays its work until every block belonging to
// an earlier group (grouped by arrival order, not by blockIdx) has finished.
//
// counters[0] is used as a ticket dispenser: thread 0 of every block takes the
//             next value with atomicAdd.
// counters[1] is used as a completion count: thread 0 of every block adds 1
//             to it after the block's work.
// Both counters must be 0 when the kernel starts.
//
// The pointer is volatile so that the spin loop re-reads counters[1] from
// memory on every iteration; atomicAdd has no volatile overload, hence the
// casts back to int* at the atomic call sites.
__global__
static void waitTest(volatile int* counters)
{
    // Number of tickets per group; a block waits for all earlier groups.
    const int kGroupSize = 16;

    // Ticket of this block; written by thread 0 only.
    __shared__ int orderedBlockId;
    const int tid = threadIdx.x;

    // Only thread 0 takes the ticket and spins; the other threads of the
    // block are held at the __syncthreads() below until it is done.
    if(tid == 0){
        // Tickets are handed out in the order blocks actually start running,
        // so every block this one waits on has already taken its ticket.
        orderedBlockId = atomicAdd( (int*) &counters[0], 1 );

        //wait on previous groups of kGroupSize blocks
        // Integer division rounds the ticket down to a multiple of the group
        // size, i.e. the number of blocks contained in all earlier groups.
        const int expectedCounter = orderedBlockId / kGroupSize * kGroupSize;
        while(counters[1] < expectedCounter){
            //wait
        }

        // The volatile read only guarantees that the counter itself is
        // re-read. Fence so that memory accesses made after the wait are
        // ordered after the observed counter value.
        __threadfence();
    }
    __syncthreads();

    //do something

    // Every thread of the block must be past its work before the block is
    // reported as finished.
    __syncthreads();

    if(tid == 0){
        // Make the block's global-memory writes visible device-wide before
        // the completion count is bumped; otherwise a waiting block could be
        // released and still read stale data.
        __threadfence();
        atomicAdd( (int*) &counters[1], 1 );
    }
}