CUDA：在归约（reduction）期间提前终止非活动线程
我见过的大多数归约代码形如：

for( i = N; i > 0; i /= 2 ) {
    if( tid < i ) assign-shared;
    __syncthreads();
}
if( tid == 0 ) copy-value-to-global;

我把它改写为让不再参与归约的线程提前返回：

for( i = N; i > 0; i /= 2 ) {
    if( tid >= i ) return;
    assign-shared;
    __syncthreads();
}
copy-value-to-global;
并注意到了显著的性能提升。让不再参与归约的线程提前返回有什么缺点吗？

【回答】因为在原始代码中每次迭代本来就要执行 if 语句，所以我看不出有任何缺点。
【回答】如果 if 语句的结果没有空间局部性（整个线程块的判断结果通常相同），则可能看不到任何加速。此外，加速效果可能取决于设备的计算能力：较早的 CUDA 设备可能无法带来性能提升。第二个代码段性能更好，是因为已经退出的 warp 不需要再返回来执行分支检查。
理想情况下，在第二种情况下，每次迭代都会退出一个 warp，从而减轻 GPU 上的负载。

【回答】dolan 在上面的评论中提出，William Pursell 建议的方案据报道会陷入死锁。关于这个问题：根据相关资料，这段代码在大多数 GPU 上不会死锁，因为这些 GPU 支持提前退出——硬件为每个线程块维护一个活动线程计数，屏障同步使用的是该计数，而不是块启动时的初始线程数。

我以 CUDA SDK 的 reduce4 示例为基础，按照题主的问题对其进行了修改。也就是说，我比较下面两个 __global__ 函数：第一个是原始版本，第二个是提前返回的版本。
【评论】我还没有在其他体系结构上检查过计时；但对某些 GPU 来说，为了这点加速而冒死锁的风险可能并不值得（前提是可达到的加速保持在相同的数量级）。——第二段代码会导致死锁，请看我的问题。——@cicada，谢谢你的链接。
// Block-wise sum reduction (based on the CUDA SDK "reduce4" sample).
// Launch layout: 1D grid, 1D block; each block consumes 2*blockDim.x input
// elements (the first addition is done while loading from global memory).
// Dynamic shared memory: blockDim.x * sizeof(T) bytes (3rd launch argument).
// Preconditions: blockDim.x is a power of two and >= 64 — the final warp
// stage unconditionally reads sdata[tid + 32].
// Output: g_odata[blockIdx.x] receives this block's partial sum; a second
// pass (or host loop) must combine the per-block results.
template <class T>
__global__ void reduce4(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // Local thread index
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; // Global thread index - fictitiously double the block dimension
// --- First level of reduction performed in registers while reading from
//     global memory; out-of-range threads contribute the identity 0.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i+blockDim.x];
sdata[tid] = mySum;
// --- Make every shared-memory store visible before any thread reads it.
__syncthreads();
// --- Tree reduction in shared memory down to one warp's worth of data.
//     The barrier sits OUTSIDE the divergent `if`, so all threads reach it.
for (unsigned int s=blockDim.x/2; s>32; s>>=1)
{
if (tid < s) { sdata[tid] = mySum = mySum + sdata[tid + s]; }
__syncthreads();
}
// --- Final single-warp reduction by loop unrolling (blockDim.x >= 64).
//     FIX: the original placed __syncthreads() inside this divergent branch,
//     which is undefined behavior — a block barrier must be reached by ALL
//     threads of the block. Instead we use a volatile pointer (forces each
//     access to actually touch shared memory, preventing the compiler from
//     caching values in registers) plus __syncwarp() (a warp-scope barrier
//     that is legal here and required on Volta+ where warp lanes no longer
//     execute in implicit lockstep). Requires CUDA 9+.
if (tid < 32) {
volatile T *vsmem = sdata;
vsmem[tid] = mySum = mySum + vsmem[tid + 32]; __syncwarp();
vsmem[tid] = mySum = mySum + vsmem[tid + 16]; __syncwarp();
vsmem[tid] = mySum = mySum + vsmem[tid +  8]; __syncwarp();
vsmem[tid] = mySum = mySum + vsmem[tid +  4]; __syncwarp();
vsmem[tid] = mySum = mySum + vsmem[tid +  2]; __syncwarp();
vsmem[tid] = mySum = mySum + vsmem[tid +  1]; __syncwarp();
}
// --- Thread 0 writes this block's partial sum to global memory; the final
//     result over all blocks is assembled by the caller.
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Variant of reduce4 in which threads that no longer participate in the
// reduction `return` early instead of idling, as proposed in the question.
// WARNING (this is the point of the demonstration): the early `return`
// happens before the loop's __syncthreads(), so not every thread of the
// block reaches the barrier. Per the CUDA programming guide this is
// undefined behavior — it happens to work on hardware whose barrier uses a
// per-block live-thread count, but it can and reportedly does deadlock on
// other devices. Do not use this pattern in production code.
// Same launch layout / preconditions as reduce4: 1D grid, power-of-two
// blockDim.x >= 64, dynamic shared memory of blockDim.x * sizeof(T) bytes.
template <class T>
__global__ void reduce4_deadlock_test(T *g_idata, T *g_odata, unsigned int N)
{
extern __shared__ T sdata[];
unsigned int tid = threadIdx.x; // Local thread index
unsigned int i = blockIdx.x*(blockDim.x*2) + threadIdx.x; // Global thread index - Fictitiously double the block dimension
// --- Performs the first level of reduction in registers when reading from global memory.
T mySum = (i < N) ? g_idata[i] : 0;
if (i + blockDim.x < N) mySum += g_idata[i+blockDim.x];
sdata[tid] = mySum;
// --- Before going further, we have to make sure that all the shared memory loads have been completed
__syncthreads();
// --- Reduction in shared memory. Only half of the threads contribute to reduction.
for (unsigned int s=blockDim.x/2; s>32; s>>=1)
{
// --- Early exit: threads with tid >= s leave the kernel entirely.
//     NOTE(review): the surviving threads then hit the __syncthreads()
//     below while the exited ones never do — undefined behavior (see the
//     header comment). This is the deadlock risk under test.
if (tid >= s) return;
sdata[tid] = mySum = mySum + sdata[tid + s];
// --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
__syncthreads();
}
// --- Single warp reduction by loop unrolling. Assuming blockDim.x >64
// NOTE(review): __syncthreads() inside this divergent `if (tid < 32)`
// branch is likewise a misplaced block barrier (same UB as above); a
// volatile pointer + __syncwarp() would be the correct construction.
if (tid < 32) {
sdata[tid] = mySum = mySum + sdata[tid + 32]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 16]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 8]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 4]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 2]; __syncthreads();
sdata[tid] = mySum = mySum + sdata[tid + 1]; __syncthreads();
}
// --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
// individual blocks
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
N Original Modified
131072 0.021 0.019
262144 0.030 0.032
524288 0.052 0.052
1048576 0.091 0.080
2097152 0.165 0.146
4194304 0.323 0.286
8388608 0.637 0.555
16777216 1.264 1.122
33554432 2.514 2.189