Warning: file_get_contents(/data/phpspider/zhask/data//catemap/8/python-3.x/16.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
在CUDA中查找第一个非零元素_Cuda_Gpu - Fatal编程技术网

在CUDA中查找第一个非零元素

在CUDA中查找第一个非零元素,cuda,gpu,Cuda,Gpu,共享内存中有一个长度为N的数组。数组是稀疏的(有很多零元素) 目标是块中的所有线程(比如128个线程)都应该找到第一个索引j,其中N>j>=a,arr[j]为非零。显而易见的方法是: for(int i = a; i < N; i++){ if(!arr[i]){ j = i; break; } } for(int i=a;i

共享内存中有一个长度为N的数组。数组是稀疏的(有很多零元素)

目标是块中的所有线程(比如128个线程)都应该找到第一个索引j,其中N>j>=a,arr[j]为非零。显而易见的方法是:

// Sequential scan: find the first index j in [a, N) with a non-zero arr[j].
// Fix: the original tested !arr[i], which finds the first ZERO element,
// contradicting the stated goal ("arr[j] is non-zero").
for(int i = a; i < N; i++){
  if(arr[i]){
    j = i;
    break;
  }
}
for(int i=a;i&lt;N;i++){ if(!arr[i]){ j=i; break; } }

然而,这种方法是不并行的(线程不合作),并且存在严重的库冲突。我想知道是否有更有效的方法来完成这项工作。

我不明白这里怎么会有银行冲突,你是在一个线程中完成所有工作的。我的解决方案是按块大小跨步,如果找到索引,则设置一个标志

// Block-cooperative search for the smallest index i in [a, N) with arr[i] != 0.
// Fixes vs. the original sketch:
//  * __shared__ variables cannot have initializers ("= false" / "initialized
//    to all falses" is not legal CUDA); each thread now initializes its slot.
//  * current_found[] stored the found index directly, so a hit at index 0 was
//    indistinguishable from "nothing found"; N is used as an explicit sentinel.
//  * __syncthreads() sat inside a loop whose trip count diverged between
//    threads (the index_found early-exit) — undefined behavior; the barrier
//    now runs exactly once, after every thread has finished scanning.
__shared__ int arr[N];
__shared__ int current_found[128];  // one candidate per thread; N means "none found"
int block_size = blockDim.x;
int thread_id = threadIdx.x;
current_found[thread_id] = N;       // sentinel: no hit yet
for(int i = thread_id + a; i < N; i += block_size){ // stride by block_size, starting at a
    if(arr[i]){
        current_found[thread_id] = i; // first hit in this thread's stride class is its minimum
        break;
    }
}
__syncthreads();  // all candidate slots are final before the scan below
if(thread_id == 0){  // sequential min over the candidates (could also be a tree reduction)
    int first = N;
    for(int i = 0; i < 128 ; i++){
        first = min(first, current_found[i]);
    }
    // first < N  -> first is the index of the first non-zero element in [a, N)
    // first == N -> every element in [a, N) is zero
}
__shared__ int arr[N];
__shared__ bool index_found = false;
__shared__ int current_found[128]; // 假设块大小为128。假设这已初始化为所有FALSE。
int block_size = blockDim.x;
int thread_id = threadIdx.x;
for(int i = thread_id; (i < N) && !index_found; i += block_size){ // 按 block_size 跨步
    if(arr[i] && (i >= a)){
        current_found[thread_id] = i; // 这只标记在哪个迭代中找到值。但不能保证找到单个索引。我们需要在可能的索引中找到绝对解。
        index_found = true;
    }
    __syncthreads(); // 确保所有线程等待共享写入
}
if(thread_id == 0){ // 在主线程中,在可能的值中查找索引
    // 强制顺序部分
    for(int i = 0; i < 128; i++){
        if(current_found[i]){
            // i 就是你的索引。按你想要的方式在这里获取索引。
            break;
        }
    }
}
代码未在web编辑器中测试和编写。

这是一个问题。通过适当的谓词测试,基本上可以找到最小值。我将编写一些代码来演示。为了使事情简单,我将做一些简化的假设,例如我们想要做一个简单的共享内存缩减(与warp shuffle相比):

const int nTPB = 128;
// 假设nTPB(每个块的线程数)是2的幂
...
__shared__ int arr[N];
__shared__ int red[nTPB];
// 填充arr的代码...
// ...
red[threadIdx.x] = N;
__syncthreads();
// 执行测试
for (int ridx = threadIdx.x + a; ridx < N; ridx += nTPB){ // 块跨步循环
  int t1 = arr[ridx] ? ridx : N;
  red[threadIdx.x] = min(t1, red[threadIdx.x]);}
__syncthreads();
// 共享内存中标准的最小值归约
for (int ridx = nTPB>>1; ridx > 0; ridx >>= 1){
  if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
  __syncthreads();}
// 结果现在在 red[0] 中
// 结果值为 N 表示 [a, N) 范围内的所有值都为 0
下面是一个快速测试用例:

$ cat t1640.cu
#include <iostream>

const int nTPB = 128;   // threads per block; the sweep reduction below requires a power of 2
const int N = 1045;     // number of elements searched
const int a = 23;       // search range is [a, N): first non-zero index at or after a
// assume nTPB, number of threads per block, is a power of 2
// Kernel: writes to *r the smallest index i in [a, N) with d[i] != 0,
// or N if no such index exists. Launch with a single block of nTPB threads
// (nTPB must be a power of 2 for the sweep reduction).
__global__ void k(int *d, int *r){
  __shared__ int arr[N];
  __shared__ int red[nTPB];   // per-thread minimum candidates for the reduction
// code which populates arr ...
  for (int ridx = threadIdx.x; ridx < N; ridx += nTPB) // block-stride loop
    arr[ridx] = d[ridx];
  red[threadIdx.x] =  N;      // sentinel: "no non-zero element seen yet"
  __syncthreads();            // also publishes the arr[] writes above
// perform test
  for (int ridx = threadIdx.x+a; ridx < N; ridx += nTPB){ // block-stride loop
    int t1 = arr[ridx]?ridx:N;                   // candidate index, or sentinel when zero
    red[threadIdx.x] = min(t1, red[threadIdx.x]);}
  __syncthreads();
// standard min-finding sweep reduction in shared memory
  for (int ridx = nTPB>>1; ridx > 0; ridx>>=1){
    if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
    __syncthreads();}
// result is now in red[0]
// a result value of N indicates that all values in range [a, N) were 0
  if (!threadIdx.x) *r = red[0];
}

// Driver: all-zero vector except index 44, so the expected output is 44.
int main(){

  int *h_d, *d_d, *h_r, *d_r;   // h_* host buffers, d_* device buffers
  h_d = new int[N];
  h_r = new int[1];
  cudaMalloc(&d_d, N*sizeof(d_d[0]));
  cudaMalloc(&d_r, sizeof(d_r[0]));
  for (int i = 0; i < N; i++) h_d[i] = 0;
  h_d[44] = 1;
  cudaMemcpy(d_d, h_d, N*sizeof(d_d[0]), cudaMemcpyHostToDevice);
  k<<<1, nTPB>>>(d_d, d_r);
  // Kernel launches are asynchronous and report configuration errors only via
  // cudaGetLastError(); check before copying the result back.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess){
    std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  cudaMemcpy(h_r, d_r, sizeof(d_r[0]), cudaMemcpyDeviceToHost); // blocking copy: synchronizes
  std::cout << h_r[0] << std::endl;
  // Release resources (the original leaked all four allocations).
  cudaFree(d_d);
  cudaFree(d_r);
  delete [] h_d;
  delete [] h_r;
  return 0;
}
$ nvcc -o t1640 t1640.cu
$ cuda-memcheck ./t1640
========= CUDA-MEMCHECK
44
========= ERROR SUMMARY: 0 errors
$
$ cat t1640.cu
#include <iostream>
const int nTPB = 128;
const int N = 1045;
const int a = 23;
// 假设nTPB(每个块的线程数)是2的幂
__global__ void k(int *d, int *r){
  __shared__ int arr[N];
  __shared__ int red[nTPB];
// 填充arr的代码...
  for (int ridx = threadIdx.x; ridx < N; ridx += nTPB) // 块跨步循环
    arr[ridx] = d[ridx];
  red[threadIdx.x] = N;
  __syncthreads();
// 执行测试
  for (int ridx = threadIdx.x + a; ridx < N; ridx += nTPB){ // 块跨步循环
    int t1 = arr[ridx] ? ridx : N;
    red[threadIdx.x] = min(t1, red[threadIdx.x]);}
  __syncthreads();
// 共享内存中标准的最小值归约
  for (int ridx = nTPB>>1; ridx > 0; ridx >>= 1){
    if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
    __syncthreads();}
// 结果现在在 red[0] 中
  if (!threadIdx.x) *r = red[0];
}
...
$ nvcc -o t1640 t1640.cu
$ cuda-memcheck ./t1640
========= CUDA-MEMCHECK
44
========= ERROR SUMMARY: 0 errors

我们的答案是相似的。主要区别在于,您称之为"强制顺序部分"的事情不需要按照您所指示的顺序进行。您可以在一个典型的共享内存并行缩减中集体使用所有线程,以找到(最低的)索引。如果
a
为0,并且实际的第一个非零值出现在索引
i=0
处,那么您的代码也有一个错误。是的,序列部分也可以通过减少来并行化。我不明白为什么会有错误。当
i>=a
时,我正在标记找到的
当前值。如果
i
为零会怎么样(找到非零值时)?然后
current_found[thread_id]
保持为零,`if (current_found[i])`
就不会做您想做的事。
好的,现在我明白了。谢谢:)是的,您的第一个循环也可以通过从
i=thread\u id+a
开始而不是从
i=thread\u id
开始进行优化。在您建议提前退出的情况下,这会优化平均执行时间,但会使最坏情况下的执行时间更长。
// Sketch: cooperative "first non-zero index in [a, N)" via a min-reduction.
// red[] holds per-thread minimum candidates; N serves as the "not found" sentinel.
const int nTPB = 128;
// assume nTPB, number of threads per block, is a power of 2
...
__shared__ int arr[N];
__shared__ int red[nTPB];
// code which populates arr ...
// ...
red[threadIdx.x] =  N;  // sentinel: no non-zero element seen yet
__syncthreads();
// perform test
for (int ridx = threadIdx.x+a; ridx < N; ridx += nTPB){ // block-stride loop
  int t1 = arr[ridx]?ridx:N;  // candidate index, or sentinel when arr[ridx] == 0
  red[threadIdx.x] = min(t1, red[threadIdx.x]);}
__syncthreads();
// standard min-finding sweep reduction in shared memory
for (int ridx = nTPB>>1; ridx > 0; ridx>>=1){
  if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
  __syncthreads();}
// result is now in red[0]
// a result value of N indicates that all values in range [a, N) were 0
$ cat t1640.cu
#include <iostream>

const int nTPB = 128;
const int N = 1045;
const int a = 23;
// assume nTPB, number of threads per block, is a power of 2
// Kernel: writes to *r the smallest index i in [a, N) with d[i] != 0,
// or N if no such index exists. Launch with a single block of nTPB threads
// (nTPB must be a power of 2 for the sweep reduction).
__global__ void k(int *d, int *r){
  __shared__ int arr[N];
  __shared__ int red[nTPB];   // per-thread minimum candidates for the reduction
// code which populates arr ...
  for (int ridx = threadIdx.x; ridx < N; ridx += nTPB) // block-stride loop
    arr[ridx] = d[ridx];
  red[threadIdx.x] =  N;      // sentinel: "no non-zero element seen yet"
  __syncthreads();            // also publishes the arr[] writes above
// perform test
  for (int ridx = threadIdx.x+a; ridx < N; ridx += nTPB){ // block-stride loop
    int t1 = arr[ridx]?ridx:N;                   // candidate index, or sentinel when zero
    red[threadIdx.x] = min(t1, red[threadIdx.x]);}
  __syncthreads();
// standard min-finding sweep reduction in shared memory
  for (int ridx = nTPB>>1; ridx > 0; ridx>>=1){
    if (threadIdx.x < ridx) red[threadIdx.x] = min(red[threadIdx.x], red[threadIdx.x+ridx]);
    __syncthreads();}
// result is now in red[0]
// a result value of N indicates that all values in range [a, N) were 0
  if (!threadIdx.x) *r = red[0];
}

// Driver: all-zero vector except index 44, so the expected output is 44.
int main(){

  int *h_d, *d_d, *h_r, *d_r;   // h_* host buffers, d_* device buffers
  h_d = new int[N];
  h_r = new int[1];
  cudaMalloc(&d_d, N*sizeof(d_d[0]));
  cudaMalloc(&d_r, sizeof(d_r[0]));
  for (int i = 0; i < N; i++) h_d[i] = 0;
  h_d[44] = 1;
  cudaMemcpy(d_d, h_d, N*sizeof(d_d[0]), cudaMemcpyHostToDevice);
  k<<<1, nTPB>>>(d_d, d_r);
  // Kernel launches are asynchronous and report configuration errors only via
  // cudaGetLastError(); check before copying the result back.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess){
    std::cerr << "kernel launch failed: " << cudaGetErrorString(err) << std::endl;
    return 1;
  }
  cudaMemcpy(h_r, d_r, sizeof(d_r[0]), cudaMemcpyDeviceToHost); // blocking copy: synchronizes
  std::cout << h_r[0] << std::endl;
  // Release resources (the original leaked all four allocations).
  cudaFree(d_d);
  cudaFree(d_r);
  delete [] h_d;
  delete [] h_r;
  return 0;
}
$ nvcc -o t1640 t1640.cu
$ cuda-memcheck ./t1640
========= CUDA-MEMCHECK
44
========= ERROR SUMMARY: 0 errors
$