CUDA——为什么基于 warp 的并行归约更慢?
我有一个关于基于 warp 的并行归约的想法:根据定义,一个 warp 内的所有线程是同步执行的。因此,输入数据可以先缩减 64 倍(每个线程归约两个元素),而不需要任何同步。与 Mark Harris 最初的实现一样,归约应用在块(block)级别,数据位于共享内存中。我创建了一个内核来分别测试他的版本和我的基于 warp 的版本。
内核本身以完全相同的方式把 blockDim.x 个元素存入共享内存,并把结果写到输出数组中其唯一的块索引处。算法本身运行正确,已用完整数组做过"计数"测试。所实现的函数体如下:
/**
 * Performs a parallel reduction with operator add on the given
 * shared-memory array; thread 0 writes the block total to outTargetVar.
 *
 * Preconditions (must hold for every calling block):
 *  - inValues points to shared memory holding blockDim.x floats
 *  - blockDim.x is a power of two (the guards below assume this)
 *  - ALL threads of the block call this function, because it executes
 *    __syncthreads()
 *
 * @param inValues      float* shared-memory input array, length == blockDim.x
 * @param outTargetVar  float& receives the sum; written by thread 0 only
 */
__device__ void reductionAddBlockThread_f(float* inValues,
                                          float &outTargetVar)
{
    // NOTE(review): in the original transcription these statements appeared
    // AFTER the function's empty body "{ }", which does not compile; they
    // have been moved inside the braces where they belong.

    // Tree reduction: each round halves the number of active threads, and a
    // barrier orders this round's shared-memory writes before the next
    // round's reads. The blockDim.x guards let the same code serve any
    // power-of-two block size up to 1024.
    if (blockDim.x >= 1024 && threadIdx.x < 512)
        inValues[threadIdx.x] += inValues[threadIdx.x + 512];
    __syncthreads();
    if (blockDim.x >= 512 && threadIdx.x < 256)
        inValues[threadIdx.x] += inValues[threadIdx.x + 256];
    __syncthreads();
    if (blockDim.x >= 256 && threadIdx.x < 128)
        inValues[threadIdx.x] += inValues[threadIdx.x + 128];
    __syncthreads();
    if (blockDim.x >= 128 && threadIdx.x < 64)
        inValues[threadIdx.x] += inValues[threadIdx.x + 64];
    __syncthreads();

    // Unrolled final warp: the remaining active threads (< 32) belong to a
    // single warp, so no __syncthreads() is used. The volatile alias forces
    // every += to be re-read from / written to shared memory instead of
    // being cached in a register — without it the compiler may keep stale
    // values and produce wrong sums (correctness fix over the original).
    // NOTE(review): on Volta and newer (independent thread scheduling) this
    // warp-synchronous idiom additionally requires __syncwarp() between
    // steps, or a __shfl_down_sync-based reduction.
    if (threadIdx.x < 32)
    {
        volatile float* smem = inValues;
        if (blockDim.x >= 64) smem[threadIdx.x] += smem[threadIdx.x + 32];
        if (blockDim.x >= 32) smem[threadIdx.x] += smem[threadIdx.x + 16];
        if (blockDim.x >= 16) smem[threadIdx.x] += smem[threadIdx.x + 8];
        if (blockDim.x >= 8)  smem[threadIdx.x] += smem[threadIdx.x + 4];
        if (blockDim.x >= 4)  smem[threadIdx.x] += smem[threadIdx.x + 2];
        if (blockDim.x >= 2)  smem[threadIdx.x] += smem[threadIdx.x + 1];

        // Thread 0 now holds the block total.
        if (threadIdx.x == 0)
            outTargetVar = smem[0];
    }
}
1. 他(Mark Harris)的版本的实现:
/**
 * Performs a parallel reduction with operator add on the given
 * shared-memory array; thread 0 writes the block total to outTargetVar.
 *
 * Preconditions (must hold for every calling block):
 *  - inValues points to shared memory holding blockDim.x floats
 *  - blockDim.x is a power of two (the guards below assume this)
 *  - ALL threads of the block call this function, because it executes
 *    __syncthreads()
 *
 * @param inValues      float* shared-memory input array, length == blockDim.x
 * @param outTargetVar  float& receives the sum; written by thread 0 only
 */
__device__ void reductionAddBlockThread_f(float* inValues,
                                          float &outTargetVar)
{
    // NOTE(review): in the original transcription these statements appeared
    // AFTER the function's empty body "{ }", which does not compile; they
    // have been moved inside the braces where they belong.

    // Tree reduction: each round halves the number of active threads, and a
    // barrier orders this round's shared-memory writes before the next
    // round's reads. The blockDim.x guards let the same code serve any
    // power-of-two block size up to 1024.
    if (blockDim.x >= 1024 && threadIdx.x < 512)
        inValues[threadIdx.x] += inValues[threadIdx.x + 512];
    __syncthreads();
    if (blockDim.x >= 512 && threadIdx.x < 256)
        inValues[threadIdx.x] += inValues[threadIdx.x + 256];
    __syncthreads();
    if (blockDim.x >= 256 && threadIdx.x < 128)
        inValues[threadIdx.x] += inValues[threadIdx.x + 128];
    __syncthreads();
    if (blockDim.x >= 128 && threadIdx.x < 64)
        inValues[threadIdx.x] += inValues[threadIdx.x + 64];
    __syncthreads();

    // Unrolled final warp: the remaining active threads (< 32) belong to a
    // single warp, so no __syncthreads() is used. The volatile alias forces
    // every += to be re-read from / written to shared memory instead of
    // being cached in a register — without it the compiler may keep stale
    // values and produce wrong sums (correctness fix over the original).
    // NOTE(review): on Volta and newer (independent thread scheduling) this
    // warp-synchronous idiom additionally requires __syncwarp() between
    // steps, or a __shfl_down_sync-based reduction.
    if (threadIdx.x < 32)
    {
        volatile float* smem = inValues;
        if (blockDim.x >= 64) smem[threadIdx.x] += smem[threadIdx.x + 32];
        if (blockDim.x >= 32) smem[threadIdx.x] += smem[threadIdx.x + 16];
        if (blockDim.x >= 16) smem[threadIdx.x] += smem[threadIdx.x + 8];
        if (blockDim.x >= 8)  smem[threadIdx.x] += smem[threadIdx.x + 4];
        if (blockDim.x >= 4)  smem[threadIdx.x] += smem[threadIdx.x + 2];
        if (blockDim.x >= 2)  smem[threadIdx.x] += smem[threadIdx.x + 1];

        // Thread 0 now holds the block total.
        if (threadIdx.x == 0)
            outTargetVar = smem[0];
    }
}
if (blockDim.x >= 1024 && threadIdx.x < 512)
    inValues[threadIdx.x] += inValues[threadIdx.x + 512];
__syncthreads();
if (blockDim.x >= 512 && threadIdx.x < 256)
    inValues[threadIdx.x] += inValues[threadIdx.x + 256];
__syncthreads();
if (blockDim.x >= 256 && threadIdx.x < 128)
    inValues[threadIdx.x] += inValues[threadIdx.x + 128];
__syncthreads();
if (blockDim.x >= 128 && threadIdx.x < 64)
    inValues[threadIdx.x] += inValues[threadIdx.x + 64];
__syncthreads();
// unroll last warp, no sync needed
if (threadIdx.x < 32)
{
    if (blockDim.x >= 64) inValues[threadIdx.x] += inValues[threadIdx.x + 32];
    if (blockDim.x >= 32) inValues[threadIdx.x] += inValues[threadIdx.x + 16];
    if (blockDim.x >= 16) inValues[threadIdx.x] += inValues[threadIdx.x + 8];
    if (blockDim.x >= 8) inValues[threadIdx.x] += inValues[threadIdx.x + 4];
    if (blockDim.x >= 4) inValues[threadIdx.x] += inValues[threadIdx.x + 2];
    if (blockDim.x >= 2) inValues[threadIdx.x] += inValues[threadIdx.x + 1];
    // set final value
    if (threadIdx.x == 0)
        outTargetVar = inValues[0];
}
资源使用情况:
/**
 * Performs a parallel reduction with operator add on the given
 * shared-memory array; thread 0 writes the block total to outTargetVar.
 *
 * Preconditions (must hold for every calling block):
 *  - inValues points to shared memory holding blockDim.x floats
 *  - blockDim.x is a power of two (the guards below assume this)
 *  - ALL threads of the block call this function, because it executes
 *    __syncthreads()
 *
 * @param inValues      float* shared-memory input array, length == blockDim.x
 * @param outTargetVar  float& receives the sum; written by thread 0 only
 */
__device__ void reductionAddBlockThread_f(float* inValues,
                                          float &outTargetVar)
{
    // NOTE(review): in the original transcription these statements appeared
    // AFTER the function's empty body "{ }", which does not compile; they
    // have been moved inside the braces where they belong.

    // Tree reduction: each round halves the number of active threads, and a
    // barrier orders this round's shared-memory writes before the next
    // round's reads. The blockDim.x guards let the same code serve any
    // power-of-two block size up to 1024.
    if (blockDim.x >= 1024 && threadIdx.x < 512)
        inValues[threadIdx.x] += inValues[threadIdx.x + 512];
    __syncthreads();
    if (blockDim.x >= 512 && threadIdx.x < 256)
        inValues[threadIdx.x] += inValues[threadIdx.x + 256];
    __syncthreads();
    if (blockDim.x >= 256 && threadIdx.x < 128)
        inValues[threadIdx.x] += inValues[threadIdx.x + 128];
    __syncthreads();
    if (blockDim.x >= 128 && threadIdx.x < 64)
        inValues[threadIdx.x] += inValues[threadIdx.x + 64];
    __syncthreads();

    // Unrolled final warp: the remaining active threads (< 32) belong to a
    // single warp, so no __syncthreads() is used. The volatile alias forces
    // every += to be re-read from / written to shared memory instead of
    // being cached in a register — without it the compiler may keep stale
    // values and produce wrong sums (correctness fix over the original).
    // NOTE(review): on Volta and newer (independent thread scheduling) this
    // warp-synchronous idiom additionally requires __syncwarp() between
    // steps, or a __shfl_down_sync-based reduction.
    if (threadIdx.x < 32)
    {
        volatile float* smem = inValues;
        if (blockDim.x >= 64) smem[threadIdx.x] += smem[threadIdx.x + 32];
        if (blockDim.x >= 32) smem[threadIdx.x] += smem[threadIdx.x + 16];
        if (blockDim.x >= 16) smem[threadIdx.x] += smem[threadIdx.x + 8];
        if (blockDim.x >= 8)  smem[threadIdx.x] += smem[threadIdx.x + 4];
        if (blockDim.x >= 4)  smem[threadIdx.x] += smem[threadIdx.x + 2];
        if (blockDim.x >= 2)  smem[threadIdx.x] += smem[threadIdx.x + 1];

        // Thread 0 now holds the block total.
        if (threadIdx.x == 0)
            outTargetVar = smem[0];
    }
}
使用了 4 次 __syncthreads(),12 条 if 语句
11 次 读+加+写 操作
1 次最终写操作
使用 5 个寄存器。性能:
/**
 * Performs a parallel reduction with operator add on the given
 * shared-memory array; thread 0 writes the block total to outTargetVar.
 *
 * Preconditions (must hold for every calling block):
 *  - inValues points to shared memory holding blockDim.x floats
 *  - blockDim.x is a power of two (the guards below assume this)
 *  - ALL threads of the block call this function, because it executes
 *    __syncthreads()
 *
 * @param inValues      float* shared-memory input array, length == blockDim.x
 * @param outTargetVar  float& receives the sum; written by thread 0 only
 */
__device__ void reductionAddBlockThread_f(float* inValues,
                                          float &outTargetVar)
{
    // NOTE(review): in the original transcription these statements appeared
    // AFTER the function's empty body "{ }", which does not compile; they
    // have been moved inside the braces where they belong.

    // Tree reduction: each round halves the number of active threads, and a
    // barrier orders this round's shared-memory writes before the next
    // round's reads. The blockDim.x guards let the same code serve any
    // power-of-two block size up to 1024.
    if (blockDim.x >= 1024 && threadIdx.x < 512)
        inValues[threadIdx.x] += inValues[threadIdx.x + 512];
    __syncthreads();
    if (blockDim.x >= 512 && threadIdx.x < 256)
        inValues[threadIdx.x] += inValues[threadIdx.x + 256];
    __syncthreads();
    if (blockDim.x >= 256 && threadIdx.x < 128)
        inValues[threadIdx.x] += inValues[threadIdx.x + 128];
    __syncthreads();
    if (blockDim.x >= 128 && threadIdx.x < 64)
        inValues[threadIdx.x] += inValues[threadIdx.x + 64];
    __syncthreads();

    // Unrolled final warp: the remaining active threads (< 32) belong to a
    // single warp, so no __syncthreads() is used. The volatile alias forces
    // every += to be re-read from / written to shared memory instead of
    // being cached in a register — without it the compiler may keep stale
    // values and produce wrong sums (correctness fix over the original).
    // NOTE(review): on Volta and newer (independent thread scheduling) this
    // warp-synchronous idiom additionally requires __syncwarp() between
    // steps, or a __shfl_down_sync-based reduction.
    if (threadIdx.x < 32)
    {
        volatile float* smem = inValues;
        if (blockDim.x >= 64) smem[threadIdx.x] += smem[threadIdx.x + 32];
        if (blockDim.x >= 32) smem[threadIdx.x] += smem[threadIdx.x + 16];
        if (blockDim.x >= 16) smem[threadIdx.x] += smem[threadIdx.x + 8];
        if (blockDim.x >= 8)  smem[threadIdx.x] += smem[threadIdx.x + 4];
        if (blockDim.x >= 4)  smem[threadIdx.x] += smem[threadIdx.x + 2];
        if (blockDim.x >= 2)  smem[threadIdx.x] += smem[threadIdx.x + 1];

        // Thread 0 now holds the block total.
        if (threadIdx.x == 0)
            outTargetVar = smem[0];
    }
}
平均五次测试运行:~19.54毫秒
2. 基于 warp 的方法:(函数体结构与上述相同)
/*
 * 先执行基于 warp 的归约,缩减系数为 64
 *
 * 每个 warp 32 个线程 -> LOG2(32) = 5
 *
 * 1024 个线程 / 每个 warp 32 个线程 = 32 个 warp
 * 每个线程比较 2 个元素 -> 32 * 2 = 每个 warp 64 个元素
 *
 * 1024 个线程 / 每个 warp 64 个元素 = 16 个 warp
 *
 * 只有一半的 warp/线程处于活动状态
 */
if (threadIdx.x < blockDim.x >> 1)
{
    const unsigned int warpId = threadIdx.x >> 5;
    // 等价写法:threadIdx.x & 31
    const unsigned int threadWarpId = threadIdx.x - (warpId << 5);
    /* 【转录受损】此处原有:每个 warp 对其 64 个元素做无同步的
       warp 内归约,以及随后归约各 warp 部分和的代码,
       在抓取/翻译过程中丢失;以下仅为可辨认的残余片段。 */
    if (blockDim.x >= 256) inValues[warpIdx] += inValues[warpIdx + 128];
    if (blockDim.x >= 128) inValues[warpIdx] += inValues[warpIdx + 64];
    // 写出最终值
    if (threadIdx.x == 0)
        outTargetVar = inValues[0];
}
资源使用情况:
/**
 * Performs a parallel reduction with operator add on the given
 * shared-memory array; thread 0 writes the block total to outTargetVar.
 *
 * Preconditions (must hold for every calling block):
 *  - inValues points to shared memory holding blockDim.x floats
 *  - blockDim.x is a power of two (the guards below assume this)
 *  - ALL threads of the block call this function, because it executes
 *    __syncthreads()
 *
 * @param inValues      float* shared-memory input array, length == blockDim.x
 * @param outTargetVar  float& receives the sum; written by thread 0 only
 */
__device__ void reductionAddBlockThread_f(float* inValues,
                                          float &outTargetVar)
{
    // NOTE(review): in the original transcription these statements appeared
    // AFTER the function's empty body "{ }", which does not compile; they
    // have been moved inside the braces where they belong.

    // Tree reduction: each round halves the number of active threads, and a
    // barrier orders this round's shared-memory writes before the next
    // round's reads. The blockDim.x guards let the same code serve any
    // power-of-two block size up to 1024.
    if (blockDim.x >= 1024 && threadIdx.x < 512)
        inValues[threadIdx.x] += inValues[threadIdx.x + 512];
    __syncthreads();
    if (blockDim.x >= 512 && threadIdx.x < 256)
        inValues[threadIdx.x] += inValues[threadIdx.x + 256];
    __syncthreads();
    if (blockDim.x >= 256 && threadIdx.x < 128)
        inValues[threadIdx.x] += inValues[threadIdx.x + 128];
    __syncthreads();
    if (blockDim.x >= 128 && threadIdx.x < 64)
        inValues[threadIdx.x] += inValues[threadIdx.x + 64];
    __syncthreads();

    // Unrolled final warp: the remaining active threads (< 32) belong to a
    // single warp, so no __syncthreads() is used. The volatile alias forces
    // every += to be re-read from / written to shared memory instead of
    // being cached in a register — without it the compiler may keep stale
    // values and produce wrong sums (correctness fix over the original).
    // NOTE(review): on Volta and newer (independent thread scheduling) this
    // warp-synchronous idiom additionally requires __syncwarp() between
    // steps, or a __shfl_down_sync-based reduction.
    if (threadIdx.x < 32)
    {
        volatile float* smem = inValues;
        if (blockDim.x >= 64) smem[threadIdx.x] += smem[threadIdx.x + 32];
        if (blockDim.x >= 32) smem[threadIdx.x] += smem[threadIdx.x + 16];
        if (blockDim.x >= 16) smem[threadIdx.x] += smem[threadIdx.x + 8];
        if (blockDim.x >= 8)  smem[threadIdx.x] += smem[threadIdx.x + 4];
        if (blockDim.x >= 4)  smem[threadIdx.x] += smem[threadIdx.x + 2];
        if (blockDim.x >= 2)  smem[threadIdx.x] += smem[threadIdx.x + 1];

        // Thread 0 now holds the block total.
        if (threadIdx.x == 0)
            outTargetVar = smem[0];
    }
}
使用了 1 次 __syncthreads(),7 条 if 语句
10 次 读+加+写 操作
1 次最终写操作
使用 5 个寄存器
5 次移位
1 次加法
1 次减法。性能:
/**
 * Performs a parallel reduction with operator add on the given
 * shared-memory array; thread 0 writes the block total to outTargetVar.
 *
 * Preconditions (must hold for every calling block):
 *  - inValues points to shared memory holding blockDim.x floats
 *  - blockDim.x is a power of two (the guards below assume this)
 *  - ALL threads of the block call this function, because it executes
 *    __syncthreads()
 *
 * @param inValues      float* shared-memory input array, length == blockDim.x
 * @param outTargetVar  float& receives the sum; written by thread 0 only
 */
__device__ void reductionAddBlockThread_f(float* inValues,
                                          float &outTargetVar)
{
    // NOTE(review): in the original transcription these statements appeared
    // AFTER the function's empty body "{ }", which does not compile; they
    // have been moved inside the braces where they belong.

    // Tree reduction: each round halves the number of active threads, and a
    // barrier orders this round's shared-memory writes before the next
    // round's reads. The blockDim.x guards let the same code serve any
    // power-of-two block size up to 1024.
    if (blockDim.x >= 1024 && threadIdx.x < 512)
        inValues[threadIdx.x] += inValues[threadIdx.x + 512];
    __syncthreads();
    if (blockDim.x >= 512 && threadIdx.x < 256)
        inValues[threadIdx.x] += inValues[threadIdx.x + 256];
    __syncthreads();
    if (blockDim.x >= 256 && threadIdx.x < 128)
        inValues[threadIdx.x] += inValues[threadIdx.x + 128];
    __syncthreads();
    if (blockDim.x >= 128 && threadIdx.x < 64)
        inValues[threadIdx.x] += inValues[threadIdx.x + 64];
    __syncthreads();

    // Unrolled final warp: the remaining active threads (< 32) belong to a
    // single warp, so no __syncthreads() is used. The volatile alias forces
    // every += to be re-read from / written to shared memory instead of
    // being cached in a register — without it the compiler may keep stale
    // values and produce wrong sums (correctness fix over the original).
    // NOTE(review): on Volta and newer (independent thread scheduling) this
    // warp-synchronous idiom additionally requires __syncwarp() between
    // steps, or a __shfl_down_sync-based reduction.
    if (threadIdx.x < 32)
    {
        volatile float* smem = inValues;
        if (blockDim.x >= 64) smem[threadIdx.x] += smem[threadIdx.x + 32];
        if (blockDim.x >= 32) smem[threadIdx.x] += smem[threadIdx.x + 16];
        if (blockDim.x >= 16) smem[threadIdx.x] += smem[threadIdx.x + 8];
        if (blockDim.x >= 8)  smem[threadIdx.x] += smem[threadIdx.x + 4];
        if (blockDim.x >= 4)  smem[threadIdx.x] += smem[threadIdx.x + 2];
        if (blockDim.x >= 2)  smem[threadIdx.x] += smem[threadIdx.x + 1];

        // Thread 0 now holds the block total.
        if (threadIdx.x == 0)
            outTargetVar = smem[0];
    }
}
五次测试运行的平均值:约 20.82 毫秒
两个内核均在 Geforce 8800 GT(512 MB)上、用 256 MB 的浮点数据多次测试。
内核以每块 256 个线程运行(100% 占用率)。
基于 warp 的版本慢了约 1.28 毫秒。
如果未来的显卡允许更大的块大小,基于 warp 的方法仍然不需要更多的同步语句,因为最大块 4096 先缩减 64 倍得到 64,再由最后一个 warp 缩减到 1。
为什么它不更快?或者这个内核思路的缺陷在哪里?
从资源使用情况来看,基于 warp 的方法本应领先。
Edit1:已更正内核——只有一半线程处于活动状态,不会导致越界读取;并补充了新的性能数据。
【回答】我认为你的代码比我的慢的原因是:在我的代码中,第一阶段的每次加法只有一半的 warp 处于活动状态;而在你的代码中,第一阶段所有 warp 都是活动的。因此总体上你的代码执行了更多的 warp 指令。在 CUDA 中,重要的是考虑执行的"warp 指令"总数,而不仅仅是单个 warp 执行的指令数。
而且,只使用一半的 warp 也没有意义——启动这些 warp 只是为了让它们判断两个分支然后退出,这本身就有开销。
另一个想法是,使用 unsigned char 和 short 实际上可能会降低性能。我不确定,但它肯定不会节省寄存器,因为它们并没有被打包进单个 32 位变量中。
另外,在我原来的代码中,我用一个模板参数 BLOCKDIM 替换了 blockDim.x,这意味着它只用了 5 条运行时 if 语句(第二阶段的 if 被编译器消除了)。
顺便说一句,计算threadWarpId
的一种更便宜的方法是
const int threadWarpId = threadIdx.x & 31;
你可以查看(原文此处的链接在转录中丢失)以获取更多想法。
编辑:
这是另一种基于扭曲的块缩减
// Warp-level tree reduction with operator + over shared memory, no barriers.
// 'level' is a compile-time template parameter, so the disabled steps are
// removed at compile time: level == 5 folds 64 values down into sdata[tid],
// level == 4 folds 32, and so on.
// The volatile pointer forces every store to reach shared memory before the
// next lane reads it; the register copy 't' avoids re-reading sdata[tid].
// NOTE(review): this relies on implicit warp-synchronous execution (no
// __syncwarp between steps) — valid pre-Volta; confirm target architecture
// before reusing on Volta or newer.
template <typename T, int level>
__device__
void sumReduceWarp(volatile T *sdata, const unsigned int tid)
{
// Running sum kept in a register; each step writes it through to shared
// memory so other lanes of the warp can observe it.
T t = sdata[tid];
if (level > 5) sdata[tid] = t = t + sdata[tid + 32];
if (level > 4) sdata[tid] = t = t + sdata[tid + 16];
if (level > 3) sdata[tid] = t = t + sdata[tid + 8];
if (level > 2) sdata[tid] = t = t + sdata[tid + 4];
if (level > 1) sdata[tid] = t = t + sdata[tid + 2];
if (level > 0) sdata[tid] = t = t + sdata[tid + 1];
}
// Block-level sum reduction built from warp-local reductions.
// Phase 1: every warp reduces its own 64-element slice of sdata (elements
// [warp*64, warp*64+63]) with no __syncthreads(). Phase 2: warp 0 reduces
// the per-warp partial sums and lane 0 writes the total to *output.
// Precondition (from the comment below): sdata has 2 * blockDim.x elements.
template <typename T>
__device__
void sumReduceBlock(T *output, volatile T *sdata)
{
// sdata is a shared array of length 2 * blockDim.x
const unsigned int warp = threadIdx.x >> 5;
const unsigned int lane = threadIdx.x & 31;
// Each warp addresses a disjoint 64-element slice: base = warp * 64 + lane,
// so lanes also read base + 32 inside sumReduceWarp<T, 5>.
const unsigned int tid = (warp << 6) + lane;
sumReduceWarp<T, 5>(sdata, tid);
// Barrier orders the warp-local writes before the cross-warp gather below.
__syncthreads();
// lane 0 of each warp now contains the sum of two warp's values
if (lane == 0) sdata[warp] = sdata[tid];
__syncthreads();
// Warp 0 reduces the per-warp partials; level 4 folds up to 32 values,
// matching the at-most-32 warps of a 1024-thread block.
if (warp == 0) {
sumReduceWarp<T, 4>(sdata, threadIdx.x);
if (lane == 0) *output = sdata[0];
}
}
template <typename T, int level>
__device__
void sumReduceWarp(volatile T *sdata, const unsigned int tid)
{
T t = sdata[tid];
if (level > 5) sdata[tid] = t = t + sdata[tid + 32];
if (level > 4) sdata[tid] = t = t + sdata[tid + 16];
if (level > 3) sdata[tid] = t = t + sdata[tid + 8];
if (level > 2) sdata[tid] = t = t + sdata[tid + 4];
if (level > 1) sdata[tid] = t = t + sdata[tid + 2];
if (level > 0) sdata[tid] = t = t + sdata[tid + 1];
}
template <typename T>
__device__
void sumReduceBlock(T *output, volatile T *sdata)
{
// sdata is a shared array of length 2 * blockDim.x
const unsigned int warp = threadIdx.x >> 5;
const unsigned int lane = threadIdx.x & 31;
const unsigned int tid = (warp << 6) + lane;