如何跟踪已执行的CUDA块?
为了测试我对事物的理解,我决定修改CUDA样本中的向量加法,以便内核在特定时间后退出,然后重新启动以完成。我实现超时的方法是在一段时间后将主机设置为1的固定变量。在内核中,将执行此变量的检查,以确定是否应继续执行。如果线程继续执行,它将被标记为完成。为了测试每个线程只执行一次,我修改了C[I]=C[I]+B[I]的加法,这一切都能按预期工作;设备代码如下所示:如何跟踪已执行的CUDA块?,cuda,Cuda,为了测试我对事物的理解,我决定修改CUDA样本中的向量加法,以便内核在特定时间后退出,然后重新启动以完成。我实现超时的方法是在一段时间后将主机设置为1的固定变量。在内核中,将执行此变量的检查,以确定是否应继续执行。如果线程继续执行,它将被标记为完成。为了测试每个线程只执行一次,我修改了C[I]=C[I]+B[I]的加法,这一切都能按预期工作;设备代码如下所示: /* Function * Internal device function used for getting the curre
/* Function
* Internal device function used for getting the current thread's global ID
* regardless of the block/grid configuration. It assumes that the
* grid and block are 3 dimensional.
*
* @return: The thread's global ID
*/
static __device__ int get_global_idx()
{
int blockId = blockIdx.x
+ blockIdx.y * gridDim.x
+ gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z)
+ (threadIdx.z * (blockDim.x * blockDim.y))
+ (threadIdx.y * blockDim.x)
+ threadIdx.x;
return threadId;
}
/* Function
* Device function that determines if the current thread should continue execution.
* A check should be used on the return value. If the timeout has not been set
* and the thread has not previously executed the index at the thread's ID in the
* thread_ids array is set to 1 to indicate it was allowed to proceed.
*
* @param thread_ids: A pointer to the array with a size that matches the max number
* of threads that will be spawned
*
* @param time_out: Memory mapped variable used by the host to signal the kernel when
* execution should suspend
*
* @return: A boolean value indicating whether the current thread should continue or not
*/
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out)
{
if(*time_out == 1){
return false;
}
int tid = get_global_idx();
if(thread_ids[tid] == 1)
{
return false;
}
thread_ids[tid] = 1;
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, unsigned int *thread_ids, volatile unsigned int *timeout)
{
if(!continue(thread_ids, timeout))
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
/* C[i] = A[i] + B[i]; */
C[i] = C[i] + B[i]; //Modifed from above
}
}
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue(unsigned int *thread_ids, volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
这不起作用,当我在主机h_B[I]-h_C[I]上执行验证时,我没有得到一致的零结果。这意味着某些线程以某种方式设法执行多次。你知道后一种尝试是如何/为什么发生这种情况的吗?谢谢
在这一点上,我不在乎表现;只是想了解到底发生了什么
编辑
这是完整的代码,用nvcc文件_name.cu编译并执行程序_name
您在继续执行例程中有一个竞争条件。考虑下面的场景: threadblock的warp0进入continue_执行例程。在它检查变量*time_out和b_info[bid]的那一刻,它开始见证这些变量分别为0和0。因此,它继续进行下一个if测试。 同一threadblock的warp1进入continue_执行例程(比如稍晚一点),它见证变量分别为1和0。因此它返回false并导致warp1线程退出。 warp0将继续并最终设置b_info[bid]。开始设置为1,然后更新线程计数。然后返回true并继续向量相加。 我可以继续这样做,但是我认为如果你仔细考虑以上3个项目,你会意识到这是一个你没有解释的情况。您隐含的期望是,每个线程都会在给定的threadblock值上读取一个连贯的值,即相同的值,持续*time_out。但您的代码并不能保证这一点,如果它不能做到这一点,那么我们将得到一些线程块,其中一些线程已经完成了它们的工作,而一些线程还没有完成 那么我们如何解决这个问题呢?上面的描述应该指明方向。一种可能的方法是保证对于任何给定的threadblock,每个线程都会获得相同的*timeout值,无论它是1还是0。一种可能的解决方案是对vectorAdd内核的开头进行以下更改: 通过这些更改,我们确保块中的每个线程都能获得超时变量的一致视图,根据我的测试,问题得到了解决:
$ cat t100.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
#ifdef USE_FIX
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
#else
if(!continue_execution(time_out, b_info))
#endif
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %ld elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_61 -o t100 t100.cu
$ ./t100 327678
[Vector addition of 327678 elements]
Result verification failed at element 0!
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$
这与问题无关,但格式说明符与变量类型不匹配。当询问此代码为什么不起作用时,请更改为%ld.进行修复?你是@RobertCrovella,谢谢。我编辑了一个最小的、完整的、可验证的示例。
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
$ cat t100.cu
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
// For the CUDA runtime routines (prefixed with "cuda_")
#include <cuda_runtime.h>
typedef struct block_info_t{
int started; /* Initialized to zero before any kernel launch */
unsigned int thread_count;
}block_info;
__device__ bool continue_execution(volatile unsigned int *time_out, block_info *b_info)
{
int bid = blockIdx.x + gridDim.x * (blockIdx.y + gridDim.z * blockIdx.z);
unsigned int bsize = blockDim.x * blockDim.y * blockDim.z;
if(*time_out == 1 && b_info[bid].started == 0)
{
return false;
}
if(b_info[bid].thread_count == bsize)
{
return false;
}
b_info[bid].started = 1;
atomicInc(&b_info[bid].thread_count, bsize);
return true;
}
__global__ void
vectorAdd(const float *A, const float *B, float *C, long numElements, volatile unsigned int *time_out, block_info *b_info)
{
#ifdef USE_FIX
__shared__ volatile unsigned int my_time_out;
if (!threadIdx.x) my_time_out = *time_out;
__syncthreads();
if(!continue_execution(&my_time_out, b_info))
#else
if(!continue_execution(time_out, b_info))
#endif
{
return;
}
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements)
{
//C[i] = A[i] + B[i];
C[i] = C[i] + B[i]; //Modified from above
}
}
void computation_complete(int *complete, int block_amt, block_info *h_block_info)
{
size_t i;
for(i = 0; i < block_amt; i++)
{
if(h_block_info[i].started == 1)
{
continue;
}
break;
}
*complete = (i == block_amt) ? 1 : 0;
}
int main(int argc, char *argv[])
{
if(argc != 2)
{
fprintf(stderr, "usage: <program-name> <vector-length>\n");
exit(EXIT_FAILURE);
}
// Print the vector length to be used, and compute its size
long numElements = strtol(argv[1], NULL, 10);
size_t size = numElements * sizeof(float);
printf("[Vector addition of %ld elements]\n", numElements);
float *h_A = (float *)malloc(size);
float *h_B = (float *)malloc(size);
float *h_C = (float *)malloc(size);
// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
h_C[i] = 0.0;
}
float *d_A = NULL;
cudaMalloc((void **)&d_A, size);
float *d_B = NULL;
cudaMalloc((void **)&d_B, size);
float *d_C = NULL;
cudaMalloc((void **)&d_C, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, size, cudaMemcpyHostToDevice);
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
size_t block_info_bytes = blocksPerGrid * sizeof(struct block_info_t);
block_info *h_block_info = (struct block_info_t *)malloc(block_info_bytes);
for(int i = 0; i < blocksPerGrid; i++)
{
h_block_info[i].started = 0;
h_block_info[i].thread_count = 0;
}
block_info *d_block_info = NULL;
cudaMalloc(&d_block_info, block_info_bytes);
cudaMemcpy(d_block_info, h_block_info, block_info_bytes, cudaMemcpyHostToDevice);
volatile unsigned int *timeout = NULL;
cudaHostAlloc((void **)&timeout, sizeof(volatile unsigned int), cudaHostAllocMapped);
*timeout = 0;
double quantum = 0.0001 * 1000000.0;
double initial_quantum = quantum;
int complete = 0;
/* Here the kernel launch is looped until all blocks are complete */
while(complete == 0)
{
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements, timeout, d_block_info);
usleep(quantum);
*timeout = 1;
cudaDeviceSynchronize();
cudaMemcpy(h_block_info, d_block_info, block_info_bytes, cudaMemcpyDeviceToHost);
computation_complete(&complete, blocksPerGrid, h_block_info);
if(complete == 0)
{
quantum = quantum + initial_quantum;
*timeout = 0;
}
}
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, "Result verification failed at element %d!\n", i);
exit(EXIT_FAILURE);
}
}
printf("Test PASSED\n");
// Free device global memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
cudaDeviceReset();
return 0;
}
$ nvcc -arch=sm_61 -o t100 t100.cu
$ ./t100 327678
[Vector addition of 327678 elements]
Result verification failed at element 0!
$ nvcc -arch=sm_61 -o t100 t100.cu -DUSE_FIX
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$ ./t100 327678
[Vector addition of 327678 elements]
Test PASSED
$
printf("[Vector addition of %d elements]\n", numElements);