Implementing a critical section in CUDA


I am trying to implement a critical section in CUDA using atomic instructions, but I have run into some trouble. I have created a test program to show the problem:

#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <stdio.h>

__global__ void k_testLocking(unsigned int* locks, int n) {
    int id = threadIdx.x % n;
    while (atomicExch(&(locks[id]), 1u) != 0u) {} //lock
    //critical section would go here
    atomicExch(&(locks[id]),0u); //unlock
}

int main(int argc, char** argv) {
    //initialize the locks array on the GPU to (0...0)
    unsigned int* locks;
    unsigned int zeros[10]; for (int i = 0; i < 10; i++) {zeros[i] = 0u;}
    cutilSafeCall(cudaMalloc((void**)&locks, sizeof(unsigned int)*10));
    cutilSafeCall(cudaMemcpy(locks, zeros, sizeof(unsigned int)*10, cudaMemcpyHostToDevice));

    //Run the kernel:
    k_testLocking<<<dim3(1), dim3(256)>>>(locks, 10);

    //Check the error messages:
    cudaError_t error = cudaGetLastError();
    cutilSafeCall(cudaFree(locks));
    if (cudaSuccess != error) {
        printf("error 1: CUDA ERROR (%d) {%s}\n", error, cudaGetErrorString(error));
        exit(-1);
    }
    return 0;
}

Okay, I figured it out; this is yet another of the pains of the CUDA paradigm.

As any good CUDA programmer knows (note that I did not remember this, which I suppose makes me a bad CUDA programmer), all threads in a warp must execute the same code. The code I wrote would work perfectly if not for this fact. As it is, however, two threads in the same warp are likely to access the same lock. If one of them acquires the lock, it simply skips the rest of the loop, but it cannot continue past the loop until all other threads in its warp have also finished the loop. Unfortunately the other thread never finishes, because it is waiting for the first one to unlock.
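
To make that failure mode concrete, here is the original spin loop again with comments marking where the warp gets stuck (this simply restates the reasoning above as annotations):

__global__ void k_testLocking(unsigned int* locks, int n) {
    int id = threadIdx.x % n;                      // several threads of one warp can map to the same lock
    while (atomicExch(&(locks[id]), 1u) != 0u) {}  // one lane wins the lock and leaves the loop body, but it
                                                   // cannot move past the loop until every lane of its warp
                                                   // has left it; the losing lanes keep spinning, waiting for
                                                   // an unlock the winning lane never reaches
    atomicExch(&(locks[id]), 0u);                  // unlock: never reached by the winner, hence the deadlock
}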

Here is a kernel that accomplishes the task without error:

__global__ void k_testLocking(unsigned int* locks, int n) {
    int id = threadIdx.x % n;
    bool leaveLoop = false;
    while (!leaveLoop) {
        if (atomicExch(&(locks[id]), 1u) == 0u) {
            //critical section
            leaveLoop = true;
            atomicExch(&(locks[id]),0u);
        }
    } 
}

By the way, you have to remember that global memory writes and reads are not necessarily completed at the point where you write them in the code, so for this to work in practice you need to add a global memory fence, i.e. __threadfence().
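
As a minimal sketch of that remark (my addition, not part of the original post), the working kernel could issue __threadfence() after the protected writes and before releasing the lock, so that whichever thread acquires the lock next observes them:

__global__ void k_testLocking(unsigned int* locks, int n) {
    int id = threadIdx.x % n;
    bool leaveLoop = false;
    while (!leaveLoop) {
        if (atomicExch(&(locks[id]), 1u) == 0u) {
            //critical section: writes to global memory would go here
            __threadfence();               //make those writes visible to all threads before unlocking
            leaveLoop = true;
            atomicExch(&(locks[id]), 0u);  //unlock
        }
    }
}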

The poster has already found an answer to his own question. Nevertheless, in the code below I provide a general framework for implementing a critical section in CUDA. More in detail, the code performs a block counting, but it can easily be modified to host other operations to be performed inside the critical section. Below, I also report some explanation of the code, together with some "typical" mistakes in the implementation of critical sections in CUDA.

The code

#include <stdio.h>

#include "Utilities.cuh"

#define NUMBLOCKS  512
#define NUMTHREADS 512 * 2

/***************/
/* LOCK STRUCT */
/***************/
struct Lock {

    int *d_state;

    // --- Constructor
    Lock(void) {
        int h_state = 0;                                        // --- Host side lock state initializer
        gpuErrchk(cudaMalloc((void **)&d_state, sizeof(int)));  // --- Allocate device side lock state
        gpuErrchk(cudaMemcpy(d_state, &h_state, sizeof(int), cudaMemcpyHostToDevice)); // --- Initialize device side lock state
    }

    // --- Destructor
    __host__ __device__ ~Lock(void) { 
#if !defined(__CUDACC__)
        gpuErrchk(cudaFree(d_state)); 
#else
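        // --- Skipped when compiled through nvcc (__CUDACC__ defined): the destructor is also emitted as device code, where freeing the lock state would be invalid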

#endif  
    }

    // --- Lock function
    __device__ void lock(void) { while (atomicCAS(d_state, 0, 1) != 0); }

    // --- Unlock function
    __device__ void unlock(void) { atomicExch(d_state, 0); }
};

/*************************************/
/* BLOCK COUNTER KERNEL WITHOUT LOCK */
/*************************************/
__global__ void blockCountingKernelNoLock(int *numBlocks) {

    if (threadIdx.x == 0) { numBlocks[0] = numBlocks[0] + 1; }
}

/**********************************/
/* BLOCK COUNTER KERNEL WITH LOCK */
/**********************************/
__global__ void blockCountingKernelLock(Lock lock, int *numBlocks) {

    if (threadIdx.x == 0) {
        lock.lock();
        numBlocks[0] = numBlocks[0] + 1;
        lock.unlock();
    }
}

/****************************************/
/* BLOCK COUNTER KERNEL WITH WRONG LOCK */
/****************************************/
__global__ void blockCountingKernelDeadlock(Lock lock, int *numBlocks) {

    lock.lock();
    if (threadIdx.x == 0) { numBlocks[0] = numBlocks[0] + 1; }
    lock.unlock();
}

/********/
/* MAIN */
/********/
int main(){

    int h_counting, *d_counting;
    Lock lock;

    gpuErrchk(cudaMalloc(&d_counting, sizeof(int)));

    // --- Unlocked case
    h_counting = 0;
    gpuErrchk(cudaMemcpy(d_counting, &h_counting, sizeof(int), cudaMemcpyHostToDevice));

    blockCountingKernelNoLock<<<NUMBLOCKS, NUMTHREADS>>>(d_counting);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(&h_counting, d_counting, sizeof(int), cudaMemcpyDeviceToHost));
    printf("Counting in the unlocked case: %i\n", h_counting);

    // --- Locked case
    h_counting = 0;
    gpuErrchk(cudaMemcpy(d_counting, &h_counting, sizeof(int), cudaMemcpyHostToDevice));

    blockCountingKernelLock<<<NUMBLOCKS, NUMTHREADS>>>(lock, d_counting);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(&h_counting, d_counting, sizeof(int), cudaMemcpyDeviceToHost));
    printf("Counting in the locked case: %i\n", h_counting);

    gpuErrchk(cudaFree(d_counting));
}
Explanation of the code

A critical section is a sequence of operations that must be executed sequentially by the CUDA threads.

Suppose one constructs a kernel whose task is to count the number of thread blocks in a thread grid. One possible idea is to let each thread having threadIdx.x == 0 in each block increase a global counter. To prevent race conditions, all the increases must occur sequentially, so they must be incorporated in a critical section.

The above code has two kernel functions: blockCountingKernelNoLock and blockCountingKernelLock. The former does not use a critical section to increase the counter and, as one can see, returns wrong results: the read-modify-write numBlocks[0] = numBlocks[0] + 1 is not atomic, so concurrent blocks overwrite each other's updates. The latter encapsulates the counter increase within a critical section and so produces correct results. But how does the critical section work?

The critical section is governed by a global state d_state. Initially, the state is 0. Furthermore, two __device__ methods, lock and unlock, can change this state. The lock and unlock methods can be called only by a single thread within each block, in particular by the thread having local thread index threadIdx.x == 0.

During the execution, one of the threads having local thread index threadIdx.x == 0 and global thread index, say, t, will be the first to call the lock method. In particular, it will launch atomicCAS(d_state, 0, 1). Since d_state is initially 0, the call updates d_state to 1 and returns 0, so thread t exits the spin loop in lock and enters the critical section. Meanwhile, the threadIdx.x == 0 threads of all the other blocks find d_state == 1: their atomicCAS performs no update and returns 1, so they keep spinning in the while loop. Once thread t has performed its increment and calls unlock, atomicExch(d_state, 0) restores the state to 0 and one of the waiting threads can acquire the lock in turn. This is also why blockCountingKernelDeadlock is wrong: there, every thread of a block calls lock, so several threads of the same warp contend for the same lock and the warp deadlocks, for exactly the reason described earlier in this thread.
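
As a closing aside (a minimal sketch of my own, not part of the original answer): for this particular counting task the lock is not strictly needed, because the whole critical section is a single increment that can be expressed as one hardware atomic. The Lock framework becomes useful when the protected region contains more than one dependent operation.

__global__ void blockCountingKernelAtomic(int *numBlocks) {
    // --- A single atomicAdd per block replaces the lock / increment / unlock sequence
    if (threadIdx.x == 0) atomicAdd(numBlocks, 1);
}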