如何消除CUDA内核中的串行部分
我的内核中有一个串行部分（每个线程内部的循环），它严重拖慢了执行速度。然而，我不知道如何去掉这个内部循环。有什么建议吗？（标签：cuda, gpu, gpgpu, nvidia）内核代码如下：
/*
 * Fills the lower triangle (diagonal included) of d_A, one entry per thread.
 *
 * Launch layout: 2D. The x index of a thread selects i and the y index
 * selects j. Threads with i >= keep or j > i do nothing, so any grid that
 * covers 0..keep-1 in both dimensions is sufficient.
 *
 * For an active (i, j) the thread walks k1 = 0..inc-1 sequentially. Whenever
 * d_Xvalid[j*inc + k1] is false, it adds d_Xvalid[i*inc + d_Xco[j*width + k1]]
 * (0 or 1) to a running count, and finally stores
 *     d_A[i*keep + j] = inc - d_Xnum[i] - count.
 *
 * Parameters:
 *   keep      number of i/j indices; also the row stride of d_A
 *   inc       loop trip count; also the row stride of d_Xvalid
 *   width     row stride of d_Xco
 *   d_Xnum    per-i value subtracted from inc (read at index i)
 *   d_Xco     index table, read at [j*width + k1]; its values are used as
 *             column offsets into row i of d_Xvalid
 *   d_Xvalid  boolean flags, read at [j*inc + k1] and [i*inc + d_Xco[...]]
 *   d_A       output, written at [i*keep + j]
 *
 * NOTE(review): the loop reads d_Xco[j*width + k1] for k1 < inc, so this
 * presumably relies on width >= inc; confirm against the caller.
 */
__global__ void myKernel( int keep, int inc, int width, int* d_Xnum,
                          int* d_Xco, bool* d_Xvalid, int* d_A )
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Only pairs with col <= row < keep produce an output entry.
    if (row >= keep || col > row)
        return;

    // Row bases for the three tables touched inside the loop.
    const bool* colFlags  = d_Xvalid + col * inc;
    const bool* rowFlags  = d_Xvalid + row * inc;
    const int*  colLookup = d_Xco + col * width;

    int hits = 0;
    int pos = 0;
    while (pos < inc) {
        // Positions flagged false in row `col` contribute the flag found in
        // row `row` at the column named by the lookup table.
        if (!colFlags[pos])
            hits += rowFlags[colLookup[pos]];
        ++pos;
    }

    d_A[row * keep + col] = inc - d_Xnum[row] - hits;
}
`keep` 约为 9000，而 `inc` 约为 20000。
这并不是对您问题的直接回答，但它可以优化您的代码，并且可能有助于您对 `k1` 上的求和进行并行归约：
您可以去掉 `if(i < keep && j <= i)` 这一条件判断（原文此处在抓取时被截断），改为用一维线性索引 `k` 只为下三角中的元素启动线程，即调用 `myKernel<<<…>>>(…)`。
Done! 从数学上讲，这个方法非常巧妙，但我不明白为什么启动更多线程反而会比这段代码慢…… @Manolete：它已经可以工作了吗？线程是以线程束（warp，目前每个 warp 含 32 个线程）为单位提交给 GPU 的。当某些线程无事可做时，它们仍然必须等待同一 warp 中的其他线程完成操作，从而占用了本可以用来做有效计算的线程位置。大多数情况下（尽管并非总是如此），大量非活动线程意味着代码更慢。 @JeanMoniuc：它可以工作，但有时会给出错误的结果。我们是否遗漏了什么？我已经用小规模数据调试过，它工作正常，但我不明白为什么它会随机失败……
...
// Host-side launch of the 2D-indexed myKernel (x index -> i, y index -> j).
// NOTE(review): assumes the target is the kernel that reads both the x and y
// thread indices; a 1D-indexed kernel needs a 1D grid sized from
// keep*(keep+1)/2 instead.
int t = 32;
// Number of t-wide blocks needed to span `keep` indices (may include one
// spare block when keep is a multiple of t; the kernel guard makes that safe).
int b = keep / t + 1;
int b2 = (inc / t) + 1;  // not used by this launch; kept in case later code (not shown) reads it
// t x t = 1024 threads per block.
dim3 thread (t, t);
// Despite its name, `block` is the grid dimension. The kernel ignores every
// (i, j) with i >= keep or j > i, so both grid axes only need to span `keep`.
// Sizing the y axis from `inc` launched inc*t rows of threads, almost all of
// which exited immediately without producing output.
dim3 block (b, b);
// kernel call
// NOTE(review): the first argument is `k`, while the grid is sized from
// `keep`; confirm that `k` holds the same value the kernel expects as `keep`.
myKernel<<<block, thread>>>(k, inc, width, d_Xnum,
                            d_Xco, d_Xvalid, d_A);
// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its
// replacement. NOTE(review): the returned cudaError_t (and cudaGetLastError()
// after the launch) should be checked with the project's error handling.
cudaDeviceSynchronize();
...
/*
 * Fills the lower triangle (diagonal included) of d_A, one entry per thread,
 * using a single linear index instead of a 2D (i, j) launch.
 *
 * Launch layout: 1D. Only the x index is read. Thread k handles the k-th
 * pair (i, j) with j <= i, enumerated row by row:
 *     k = i*(i+1)/2 + j,   0 <= j <= i.
 * At least keep*(keep+1)/2 threads are needed to cover every pair; threads
 * beyond that range do nothing.
 *
 * For an active (i, j) the thread walks k1 = 0..inc-1 sequentially. Whenever
 * d_Xvalid[j*inc + k1] is false, it adds d_Xvalid[i*inc + d_Xco[j*width + k1]]
 * (0 or 1) to a running count, and finally stores
 *     d_A[i*keep + j] = inc - d_Xnum[i] - count.
 *
 * Parameters:
 *   keep      number of i/j indices; also the row stride of d_A
 *   inc       loop trip count; also the row stride of d_Xvalid
 *   width     row stride of d_Xco
 *   d_Xnum    per-i value subtracted from inc (read at index i)
 *   d_Xco     index table, read at [j*width + k1]
 *   d_Xvalid  boolean flags, read at [j*inc + k1] and [i*inc + d_Xco[...]]
 *   d_A       output, written at [i*keep + j]
 *
 * NOTE(review): the loop reads d_Xco[j*width + k1] for k1 < inc, so this
 * presumably relies on width >= inc; confirm against the caller.
 */
__global__ void myKernel( int keep, int inc, int width, int* d_Xnum,
                          int* d_Xco, bool* d_Xvalid, int* d_A )
{
    // 64-bit linear index: the number of pairs grows as keep^2 / 2.
    const long long k = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Invert k = i*(i+1)/2 + j. The double-precision square root is only an
    // estimate; the two loops below make the result exact even if the
    // estimate is off by rounding, so j can never fall outside 0..i.
    long long row = (long long)(sqrt(0.25 + 2.0 * (double)k) - 0.5);
    while (row * (row + 1) / 2 > k)
        --row;
    while ((row + 1) * (row + 2) / 2 <= k)
        ++row;

    // j <= i holds by construction, so i < keep is the only bound required.
    // (Comparing j against inc mixed up a row index with the column count
    // and dropped valid pairs whenever inc < keep.)
    if (row >= keep)
        return;

    const int i = (int)row;
    const int j = (int)(k - row * (row + 1) / 2);

    // 64-bit row offsets so that i*inc, j*width and i*keep cannot wrap.
    const long long flagsI  = (long long)i * inc;
    const long long flagsJ  = (long long)j * inc;
    const long long lookupJ = (long long)j * width;

    int counter = 0;
    for (int k1 = 0; k1 < inc; k1++) {
        if (d_Xvalid[flagsJ + k1] == 0)
            counter += d_Xvalid[flagsI + d_Xco[lookupJ + k1]];
    }
    d_A[(long long)i * keep + j] = inc - d_Xnum[i] - counter;
}
_________________________________
| i = 0 | i = 0 | i = 0 | i = 0 |
| j = 0 | - | - | - |
_________________________________
| i = 1 | i = 1 | i = 1 | i = 1 |
| j = 0 | j = 1 | - | - |
_________________________________
| i = 2 | i = 2 | i = 2 | i = 2 |
| j = 0 | j = 1 | j = 2 | - |
_________________________________
| i = 3 | i = 3 | i = 3 | i = 3 |
| j = 0 | j = 1 | j = 2 | j = 3 |
_________________________________
_________________________________________________________________________________
| k = 0 | k = 1 | k = 2 | k = 3 | k = 4 | k = 5 | k = 6 | k = 7 | k = 8 | k = 9 |
| i = 0 | i = 1 | i = 1 | i = 2 | i = 2 | i = 2 | i = 3 | i = 3 | i = 3 | i = 3 |
| j = 0 | j = 0 | j = 1 | j = 0 | j = 1 | j = 2 | j = 0 | j = 1 | j = 2 | j = 3 |
_________________________________________________________________________________