如何消除CUDA内核中的串行部分

如何消除CUDA中的连续段,cuda,gpu,gpgpu,nvidia,Cuda,Gpu,Gpgpu,Nvidia,我的内核中有一个连续的部分,它确实在减慢速度。然而,我不知道如何摆脱内部循环。有什么建议吗 __global__ void myKernel( int keep, int inc, int width, int* d_Xnum, int* d_Xco, bool* d_Xvalid,int* d_A ) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx

我的内核中有一个串行执行的部分(内部循环),它确实拖慢了速度。然而,我不知道如何去掉这个内部循环。有什么建议吗?

// Fills the lower triangle (j <= i) of the keep x keep matrix d_A.
//
// Launch layout: 2D. The x dimension of the launch selects i, the y dimension
// selects j. Threads with i >= keep or j > i exit without touching memory.
// No shared memory is used.
//
// For its (i, j) pair a thread scans k1 = 0 .. inc-1 and counts the positions
// where d_Xvalid[j*inc + k1] is false while
// d_Xvalid[i*inc + d_Xco[j*width + k1]] is true, then stores
//   d_A[i*keep + j] = inc - d_Xnum[i] - count.
//
// NOTE(review): d_Xco is read at j*width + k1 for k1 up to inc-1, so this
// presumably relies on width >= inc -- confirm against the caller.
__global__ void myKernel( int keep, int inc, int width, int* d_Xnum,
 int* d_Xco, bool* d_Xvalid,int* d_A )
{
  const int row = blockIdx.x * blockDim.x + threadIdx.x;
  const int col = blockIdx.y * blockDim.y + threadIdx.y;

  // Only pairs inside the lower triangle (diagonal included) do any work.
  if (row >= keep || col > row)
    return;

  // Row starts are loop-invariant, so resolve them once.
  const bool* colFlags = d_Xvalid + col * inc;
  const bool* rowFlags = d_Xvalid + row * inc;
  const int*  colIndex = d_Xco + col * width;

  int hits = 0;
  int pos = 0;
  while (pos < inc) {
    if (!colFlags[pos])
      hits += rowFlags[colIndex[pos]];  // bool promotes to 0 or 1
    ++pos;
  }

  d_A[row * keep + col] = inc - d_Xnum[row] - hits;
}

keep 约为 9000,而 inc 约为 20000。

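这并不是对您问题的确切回答,但它或许可以优化您的代码,并可能有助于您对 k1 的求和做并行归约,因为这样可以去掉 if( i < keep && j <= i ) 这个判断。
(此处原文在提取时有缺失:关于如何把二维索引 (i, j) 压平成一维索引 k 的说明未能保留,对应的内核代码与示意图见下文。)
之后以 myKernel<<<…>>>(…) 的方式启动内核。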
Done!从数学上讲,这个做法非常聪明,但我不明白为什么启动更多线程反而会让代码变慢……
@Manolete:它已经能工作了吗?线程是以 warp(线程束)为单位发送到 GPU 的(目前每个 warp 有 32 个线程)。当某些线程无事可做时,它们仍然必须等待同一 warp 中的其他线程完成操作,从而占据了本可以用来计算所需内容的线程的位置。大多数情况下(尽管并非总是如此),大量非活动线程意味着代码更慢。
@JeanMoniuc:它可以工作,但有时会给出错误的结果。我们是否遗漏了什么?我对它做了少量调试,调试时工作正常,但我不明白为什么它会随机失败……
         ...
  // Launch configuration for the flattened-triangle kernel: that kernel
  // derives its (i, j) pair from a single flat index taken from the x
  // dimension only, so it needs a 1D launch with one thread per pair
  // 0 <= j <= i < k, i.e. k*(k+1)/2 threads in total. A 2D launch of size
  // (b, inc) with 32x32 blocks only ever reaches the first b*32 flat indices
  // and recomputes each of them once per y-thread.
  //
  // NOTE(review): the row count passed to the kernel is `k`, so the grid is
  // sized from `k` as well; presumably `k` holds the same value as `keep` --
  // confirm against the surrounding code.
  //
  // NOTE(review): for inputs of the size mentioned in the post (~9000 rows)
  // this needs more than 65535 blocks in x, which requires SM30+.
  const long long pairs = (long long)k * ((long long)k + 1) / 2;
  int t = 256;                         // threads per block, multiple of 32
  int b = (int)((pairs + t - 1) / t);  // ceil-div: blocks needed to cover every pair
  if (b < 1) b = 1;                    // a zero-sized grid is an invalid launch
  int b2 = (inc/32)+1;                 // not used by this launch; kept in case elided code reads it
  dim3 thread (t);                     // block dimensions
  dim3 block (b);                      // grid dimensions

  // kernel call
  myKernel<<<block, thread>>>(k, inc, width, d_Xnum,
                  d_Xco, d_Xvalid, d_A);
  // Launch-configuration errors are only visible through cudaGetLastError();
  // faults raised while the kernel runs surface at the synchronizing call.
  // cudaDeviceSynchronize() replaces the deprecated cudaThreadSynchronize().
  cudaError_t status = cudaGetLastError();
  if (status == cudaSuccess)
    status = cudaDeviceSynchronize();
  // `status` now holds the first error, if any; the contents of d_A are only
  // meaningful when it equals cudaSuccess.
            ...
// Fills the lower triangle (j <= i) of the keep x keep matrix d_A, enumerating
// the (i, j) pairs through one flat index k so that no launched thread falls
// outside the triangle:
//
//   k = i*(i+1)/2 + j,   0 <= j <= i < keep
//   i = floor(sqrt(0.25 + 2k) - 0.5),   j = k - i*(i+1)/2
//
// For each pair the kernel scans k1 = 0 .. inc-1, counts the positions where
// d_Xvalid[j*inc + k1] is false while d_Xvalid[i*inc + d_Xco[j*width + k1]]
// is true, and stores d_A[i*keep + j] = inc - d_Xnum[i] - count.
//
// Launch layout: any. The thread id is linearized over every grid and block
// dimension and the pairs are walked with a grid-stride loop, so 1D, 2D and
// undersized launches all cover each pair exactly once. The natural
// configuration is a 1D launch of keep*(keep+1)/2 threads. No shared memory
// is used.
//
// NOTE(review): d_Xco is read at j*width + k1 for k1 up to inc-1, so this
// presumably relies on width >= inc -- confirm against the caller.
__global__ void myKernel( int keep, int inc, int width, int* d_Xnum,
int* d_Xco, bool* d_Xvalid,int* d_A )
{
  if (keep <= 0)
    return;

  // Number of pairs in the lower triangle, diagonal included.
  const long long pairCount = (long long)keep * ((long long)keep + 1) / 2;

  // Linear thread id over the whole launch. 64-bit because a large
  // multi-dimensional launch can hold more than 2^31 threads.
  const long long blockThreads = (long long)blockDim.x * blockDim.y * blockDim.z;
  const long long blockRank =
      ((long long)blockIdx.z * gridDim.y + blockIdx.y) * gridDim.x + blockIdx.x;
  const long long threadRank =
      ((long long)threadIdx.z * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
  const long long stride = blockThreads * gridDim.x * gridDim.y * gridDim.z;

  for (long long k = blockRank * blockThreads + threadRank; k < pairCount; k += stride) {
    // Double precision on purpose: k can exceed the 24-bit float mantissa.
    long long i = (long long)(sqrt(0.25 + 2.0 * (double)k) - 0.5);
    // The truncated root is only an estimate; settle it with exact integer
    // arithmetic so that i*(i+1)/2 <= k < (i+1)*(i+2)/2.
    while (i > 0 && i * (i + 1) / 2 > k) --i;
    while ((i + 1) * (i + 2) / 2 <= k) ++i;
    const long long j = k - i * (i + 1) / 2;
    // k < pairCount guarantees 0 <= j <= i < keep, so no further guard is needed.

    const bool* validJ = d_Xvalid + j * inc;
    const bool* validI = d_Xvalid + i * inc;
    const int*  coJ    = d_Xco + j * width;

    int counter = 0;
    for (int k1 = 0; k1 < inc; k1++) {
      if (!validJ[k1])
        counter += validI[coJ[k1]];  // bool promotes to 0 or 1
    }

    d_A[i * keep + j] = inc - d_Xnum[i] - counter;
  }
}
_________________________________
| i = 0 | i = 0 | i = 0 | i = 0 |
| j = 0 |   -   |   -   |   -   |
_________________________________
| i = 1 | i = 1 | i = 1 | i = 1 |
| j = 0 | j = 1 |   -   |   -   |
_________________________________
| i = 2 | i = 2 | i = 2 | i = 2 |
| j = 0 | j = 1 | j = 2 |   -   |
_________________________________
| i = 3 | i = 3 | i = 3 | i = 3 |
| j = 0 | j = 1 | j = 2 | j = 3 |
_________________________________
_________________________________________________________________________________
| k = 0 | k = 1 | k = 2 | k = 3 | k = 4 | k = 5 | k = 6 | k = 7 | k = 8 | k = 9 |
| i = 0 | i = 1 | i = 1 | i = 2 | i = 2 | i = 2 | i = 3 | i = 3 | i = 3 | i = 3 |
| j = 0 | j = 0 | j = 1 | j = 0 | j = 1 | j = 2 | j = 0 | j = 1 | j = 2 | j = 3 |
_________________________________________________________________________________