在CUDA中查找第一个非零元素_Cuda_Gpu

在CUDA中查找第一个非零元素

cuda

在CUDA中查找第一个非零元素,cuda,gpu,Cuda,Gpu,共享内存中有一个长度为N的数组。数组是稀疏的（有很多零元素）目标是块中的所有线程（比如128个线程）都应该找到第一个索引j，其中N>j>=a，arr[j]为非零。显而易见的方法是： for(int i = a; i < N; i++){ if(!arr[i]){ j = i; break; } } for（int i=a；i

共享内存中有一个长度为N的数组。数组是稀疏的（有很多零元素）

目标是块中的所有线程（比如128个线程）都应该找到第一个索引j，其中N>j>=a，arr[j]为非零。显而易见的方法是：

for(int i = a; i < N; i++){
  if(!arr[i]){
    j = i;
    break;
  }
}

for（int i=a；i


然而，这种方法是不并行的（线程不合作），并且存在严重的库冲突。我想知道是否有更有效的方法来完成这项工作。
我不明白这里怎么会有银行冲突，你是在一个线程中完成所有工作的。我的解决方案是按块大小跨步，如果找到索引，则设置一个标志
__shared__ int arr[N];
__shared__ bool index_found = false;
__shared__ int current_found[128];  //assuming block size is 128. Assuming this is initialized to all falses.
int block_size = blockDim.x;
int thread_id = threadIdx.x;
for(int i = thread_id; (i < N) && !index_found ; i += block_size){ //stride by block_size
    if(arr[i] && (i >= a)){
        current_found[thread_id] = i;  // this only marks, at which iteration the value is found. But doesn't guarantee that there is a single index found. We need to find the absolute solution among possible indices.
        index_found = true;
    }
    __syncthreads();  // make sure all threads wait for shared writes
}
if(thread_id ==0){  // in master thread, find the index among possible values
    //mandatory sequential part
    for(int i = 0; i < 128 ; i++){
        if(current_found[i]){
            // i is your index. Get your index here the way you want.
            break;
        }
    }
}

\uuuuu共享\uuuuuu int arr[N]；
__shared\uuubool index\u found=false；
__共享_uuu int当前_u找到[128]//假设块大小为128。假设这已初始化为所有FALSE。
int block_size=blockDim.x；
int thread_id=threadIdx.x；
对于（inti=thread\u id；（i=a））{
current_found[thread_id]=i；//这只标记在哪个迭代中找到值。但不能保证找到单个索引。我们需要在可能的索引中找到绝对解。
索引_found=true；
}
__syncthreads（）；//确保所有线程等待共享写入
}
如果主线程中的（thread_id==0）{//，请在可能的值中查找索引
//强制顺序部分
对于（int i=0；i<128；i++）{
如果（当前找到的\u[i]）{
//我是你的索引。按你想要的方式在这里获取索引。
打破
}
}
}

代码未在web编辑器中测试和编写。
这是一个问题。通过适当的谓词测试，基本上可以找到最小值。我将编写一些代码来演示。为了使事情简单，我将做一些简化的假设，例如我们想要做一个简单的共享内存缩减（与warp shuffle相比）：
const int nTPB=128；
//假设nTPB（每个块的线程数）是2的幂
...
__共享__u;int arr[N]；
__共享_uu_uu_uu_u_u_u_u_u_u_u_u_u_u_u_u_uu_u_u_u_；
//填充arr的代码。。。
// ...
红色[threadIdx.x]=N；
__同步线程（）；
//执行测试
对于（int-ridx=threadIdx.x+a；ridx>1；ridx>0；ridx>>=1）{
如果（threadIdx.x

下面是一个快速测试用例：
$ cat t1640.cu
#include <iostream>

const int nTPB = 128;
const int N = 1045;
const int a = 23;
// assume nTPB, number of threads per block, is a power of 2
__global__ void k(int *d, int *r){
  __shared__ int arr[N];
  __shared__ int red[nTPB];
// code which populates arr ...
  for (int ridx = threadIdx.x; ridx < N; ridx += nTPB) // block-stride loop
    arr[ridx] = d[ridx];
  red[threadIdx.x] =  N;
  __syncthreads();
// perform test
  for (int ridx = threadIdx.x+a; ridx < N; ridx += nTPB){ // block-stride loop
    int t1 = arr[ridx]?ridx:N;
    red[threadIdx.x] = min(t1, red[threadIdx.x]);}
  __syncthreads();
// standard min-finding sweep reduction in shared memory
  for (int ridx = nTPB>>1; ridx > 0; ridx>>=1){
    if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
    __syncthreads();}
// result is now in red[0]
// a result value of N indicates that all values in range [a, N) were 0
  if (!threadIdx.x) *r = red[0];
}

int main(){

  int *h_d, *d_d, *h_r, *d_r;
  h_d = new int[N];
  h_r = new int[1];
  cudaMalloc(&d_d, N*sizeof(d_d[0]));
  cudaMalloc(&d_r, sizeof(d_r[0]));
  for (int i = 0; i < N; i++) h_d[i] = 0;
  h_d[44] = 1;
  cudaMemcpy(d_d, h_d, N*sizeof(d_d[0]), cudaMemcpyHostToDevice);
  k<<<1, nTPB>>>(d_d, d_r);
  cudaMemcpy(h_r, d_r, sizeof(d_r[0]), cudaMemcpyDeviceToHost);
  std::cout << h_r[0] << std::endl;
}
$ nvcc -o t1640 t1640.cu
$ cuda-memcheck ./t1640
========= CUDA-MEMCHECK
44
========= ERROR SUMMARY: 0 errors
$

$cat t1640.cu
#包括
常数int nTPB=128；
常数int N=1045；
常数INTA=23；
//假设nTPB（每个块的线程数）是2的幂
__全局无效k（int*d，int*r）{
__共享__u;int arr[N]；
__共享_uu_uu_uu_u_u_u_u_u_u_u_u_u_u_u_u_uu_u_u_u；
//填充arr的代码。。。
对于（int-ridx=threadIdx.x；ridx>1；ridx>0；ridx>>=1）{
如果（threadIdx.xstd:：cout我们的答案是相似的。主要区别在于，您称之为“强制顺序部分”的事情不需要按照您所指示的顺序进行。您可以在一个典型的共享内存并行缩减中集体使用所有线程，以找到（最低的）索引。如果a
为0，并且实际的第一个非零值出现在索引i=0
处，那么您的代码也有一个错误。是的，序列部分也可以通过减少来并行化。我不明白为什么会有错误。当i>=a
时，我正在标记找到的当前值。如果i
为零会怎么样（找到非零值时）？然后current\u-found[thread\u-id]
保持零，如果（current\u-found[i]）
不做您想做的事，则保持零。好的，现在我明白了。谢谢：）是的，您的第一个循环也可以通过从i=thread\u id+a
开始而不是从i=thread\u id开始进行优化。在您建议提前退出的情况下，这会优化平均执行时间，但会使最坏情况下的执行时间更长。
const int nTPB = 128;
// assume nTPB, number of threads per block, is a power of 2
...
__shared__ int arr[N];
__shared__ int red[nTPB];
// code which populates arr ...
// ...
red[threadIdx.x] =  N;
__syncthreads();
// perform test
for (int ridx = threadIdx.x+a; ridx < N; ridx += nTPB){ // block-stride loop
  int t1 = arr[ridx]?ridx:N;
  red[threadIdx.x] = min(t1, red[threadIdx.x]);}
__syncthreads();
// standard min-finding sweep reduction in shared memory
for (int ridx = nTPB>>1; ridx > 0; ridx>>=1){
  if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
  __syncthreads();}
// result is now in red[0]
// a result value of N indicates that all values in range [a, N) were 0

$ cat t1640.cu
#include <iostream>

const int nTPB = 128;
const int N = 1045;
const int a = 23;
// assume nTPB, number of threads per block, is a power of 2
__global__ void k(int *d, int *r){
  __shared__ int arr[N];
  __shared__ int red[nTPB];
// code which populates arr ...
  for (int ridx = threadIdx.x; ridx < N; ridx += nTPB) // block-stride loop
    arr[ridx] = d[ridx];
  red[threadIdx.x] =  N;
  __syncthreads();
// perform test
  for (int ridx = threadIdx.x+a; ridx < N; ridx += nTPB){ // block-stride loop
    int t1 = arr[ridx]?ridx:N;
    red[threadIdx.x] = min(t1, red[threadIdx.x]);}
  __syncthreads();
// standard min-finding sweep reduction in shared memory
  for (int ridx = nTPB>>1; ridx > 0; ridx>>=1){
    if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
    __syncthreads();}
// result is now in red[0]
// a result value of N indicates that all values in range [a, N) were 0
  if (!threadIdx.x) *r = red[0];
}

int main(){

  int *h_d, *d_d, *h_r, *d_r;
  h_d = new int[N];
  h_r = new int[1];
  cudaMalloc(&d_d, N*sizeof(d_d[0]));
  cudaMalloc(&d_r, sizeof(d_r[0]));
  for (int i = 0; i < N; i++) h_d[i] = 0;
  h_d[44] = 1;
  cudaMemcpy(d_d, h_d, N*sizeof(d_d[0]), cudaMemcpyHostToDevice);
  k<<<1, nTPB>>>(d_d, d_r);
  cudaMemcpy(h_r, d_r, sizeof(d_r[0]), cudaMemcpyDeviceToHost);
  std::cout << h_r[0] << std::endl;
}
$ nvcc -o t1640 t1640.cu
$ cuda-memcheck ./t1640
========= CUDA-MEMCHECK
44
========= ERROR SUMMARY: 0 errors
$