最小值归约(min reduction)的 CUDA 代码不工作

我写了一段通过归约(reduction)求最小值的 CUDA 代码(改自 NVIDIA 的求和归约示例),但结果总是零,我不知道问题出在哪里,请帮帮我。

我写了一个代码,通过减少来找到最小值。然而,结果总是零。我不知道是什么问题。请帮帮我

这里是内核代码:我修改了英伟达的和减少代码。

#include <limits.h>

#define NumThread 128
#define NumBlock 32

// Grid-stride minimum reduction: each block writes its partial minimum
// to Out[blockIdx.x]; the host then reduces the NumBlock partial results.
// Launch as min_reduce<<<NumBlock, NumThread>>> (assumes blockDim.x == NumThread).
__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int tid = threadIdx.x;
  unsigned int i = blockIdx.x * NumThread + tid;
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  // Grid-stride load: each thread folds every gridSize-th element into
  // its private minimum before the shared-memory phase.
  while (i < n){
    if (In[i] < myMin) myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();  // all partial minima must be visible before the tree phase

  // Tree reduction, unrolled on the compile-time NumThread. Every step is
  // guarded by "tid < stride" so that sdata[tid + stride] never reads past
  // the end of the array — the missing guards here were the original
  // out-of-bounds bug (cuda-memcheck: "Invalid __shared__ read of size 4").
  if (NumThread >= 1024){
    if (tid < 512 && sdata[tid + 512] < sdata[tid]) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    if (tid < 256 && sdata[tid + 256] < sdata[tid]) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if (tid < 128 && sdata[tid + 128] < sdata[tid]) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    if (tid < 64 && sdata[tid + 64] < sdata[tid]) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }
  // Warp-synchronous tail (deprecated practice): the final 32 threads are
  // assumed to step together, so no __syncthreads(); volatile forces every
  // access to actually go to shared memory. All reads stay in bounds because
  // tid < 32 and each step is guarded by the matching NumThread condition.
  if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64 && smem[tid + 32] < smem[tid]) smem[tid] = smem[tid + 32];
    if (NumThread >= 32 && smem[tid + 16] < smem[tid]) smem[tid] = smem[tid + 16];
    if (NumThread >= 16 && smem[tid +  8] < smem[tid]) smem[tid] = smem[tid +  8];
    if (NumThread >=  8 && smem[tid +  4] < smem[tid]) smem[tid] = smem[tid +  4];
    if (NumThread >=  4 && smem[tid +  2] < smem[tid]) smem[tid] = smem[tid +  2];
    if (NumThread >=  2 && smem[tid +  1] < smem[tid]) smem[tid] = smem[tid +  1];
  }
  // sdata[0] now holds this block's minimum.
  if (tid == 0) Out[blockIdx.x] = sdata[0];
}
#include <limits.h>

#define NumThread 128
#define NumBlock 32

__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  while (i < n){
    if(In[i] < myMin)
    myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();

  if (NumThread >= 1024){
    if (tid < 512)
    if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }
  // 不推荐使用以下做法(warp 同步编程)
  if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32]) smem[tid] = smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4]) smem[tid] = smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2]) smem[tid] = smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1]) smem[tid] = smem[tid+1];
  }
  if (tid == 0)
    if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];
}
这是我的主要代码:

#include <stdio.h>
#include <stdlib.h>

#include "min_reduction.cu"

// Host driver for the minimum reduction: fills a[i] = i + 10 (so the true
// minimum is a[0] == 10), computes per-block minima on the GPU, then
// finishes the reduction of the NumBlock partial results on the CPU.
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;

  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int));  // one partial minimum per block
  for(i=0;i<length;i++) a[i] = (i + 10);

  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);

  // Final CPU pass: keep the SMALLER value. The original compared with
  // "min < out[i]", which computed the maximum of the partials instead.
  min = out[0];
  for(i=1;i<NumBlock;i++) if(out[i] < min) min = out[i];
  printf("min = %d\n", min);

  free(a); free(out);
  cudaFree(gpuA); cudaFree(gpuOut);
  return 0;
}
#include <stdio.h>
#include <stdlib.h>

#include "min_reduction.cu"

int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;

  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int));
  for(i=0;i<length;i++) a[i] = (i + 10);

  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);

  min = out[0];
  for(i=1;i<NumBlock;i++) if(min < out[i]) min = out[i];
  return 0;
}

我不确定我是否同意 @HubertApplebaum 所说的一切,但我可以同意使用 cuda-memcheck 的建议。正如您在代码中提到的,warp 同步编程可能被认为是不推荐的(deprecated),但我不能支持"它(目前)已被破坏"的说法。不过我不想就此争论,这也不是您问题的核心。

另一个有用的调试建议是,按照以下步骤使用
-lineinfo
编译代码,并使用
cuda memcheck
运行代码

========= Invalid __shared__ read of size 4
=========     at 0x000001e0 in /home/bob/misc/t1074.cu:39:min_reduce(int*, int*, int)
=========     by thread (64,0,0) in block (24,0,0)
=========     Address 0x00000200 is out of bounds
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2cd) [0x15859d]
=========     Host Frame:./t1074 [0x16dc1]
=========     Host Frame:./t1074 [0x315d3]
=========     Host Frame:./t1074 [0x28f5]
=========     Host Frame:./t1074 [0x2623]
=========     Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21d65]
=========     Host Frame:./t1074 [0x271d]
这表明代码中的一个主要问题是:您对 __shared__ 内存数组的索引越界了,而且报告还给出了发生越界的具体代码行(在我的情况下是第 39 行)。很好!然后,如果深入到该行,您将需要研究这部分代码:

  #define NumThread 128
  ...
  __shared__ int sdata[NumThread];
  ...
  if (NumThread >= 128){
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] =    sdata[tid + 64]; //line 39 in my case
    __syncthreads();
  }
您已经在128处定义了
NumThread
,并且静态分配了一个包含那么多
int
数量的共享内存数组。一切都很好。if语句中的代码呢?if条件将得到满足,这意味着块中的所有128个线程都将执行if语句的主体。但是,您正在从共享内存中读取
sdata[tid+64]
,对于
tid
大于63的线程(即每个块中的一半线程),这将在共享内存中生成大于127的索引(这是越界的,即非法的)

修复(对于您显示的特定代码)相当简单,只需添加另一个if测试:

  if (NumThread >= 128){
    if (tid < 64)
      if(sdata[tid] > sdata[tid + 64] ) sdata[tid] =    sdata[tid + 64];
    __syncthreads();
  }
如果您想找到最小值,并仔细考虑该逻辑,您会意识到您应该这样做:

  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
                                 ^
                                 |
                              greater than
  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
                                 ^
                                 |
                              大于
通过这两个更改,您的代码为我生成了正确的结果:

$ cat t1074.cu
#include <stdio.h>
#include <stdlib.h>


#include <limits.h>

#define NumThread 128
#define NumBlock 32

// Grid-stride minimum reduction: each block writes its partial minimum
// (over the elements it visited) to Out[blockIdx.x]; the host then reduces
// the NumBlock partial results. Launched as min_reduce<<<NumBlock, NumThread>>>,
// so the code assumes blockDim.x == NumThread.
__global__ void min_reduce(int* In, int* Out, int n){
  // One shared-memory slot per thread for the tree reduction.
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  // Total number of threads in the grid; stride of the load loop below.
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  // Grid-stride load: each thread folds every gridSize-th element into
  // its private minimum before the shared-memory phase.
  while (i < n){
    if(In[i] < myMin)
    myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads(); // all partial minima visible before the tree reduction

  // Tree reduction, unrolled on the compile-time NumThread. With
  // NumThread == 128 the three branches below (>=1024/512/256) are
  // compile-time false and never execute.
  if (NumThread >= 1024){
    if (tid < 512)
    if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    // NOTE(review): missing "if (tid < 256)" guard — would read out of
    // bounds if NumThread were 512; dead code at NumThread == 128.
    if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    // NOTE(review): same missing guard, and the "!= 0" test looks like a
    // leftover debugging hack; dead code at NumThread == 128.
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] =  sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    // The "tid < 64" guard keeps sdata[tid + 64] in bounds — this is the
    // fix for the out-of-bounds read that cuda-memcheck reported.
    if (tid < 64)
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] =    sdata[tid + 64];
    __syncthreads();
  }
  //the following practice is deprecated
  // Warp-synchronous tail: within the last warp no __syncthreads() is used;
  // volatile forces each read/write to actually touch shared memory.
   if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] =  smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] =  smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] =  smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] =  smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] =  smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1] )      smem[tid] =  smem[tid+1];
  }
  // After the tail, sdata[0] already holds the block minimum, so the
  // sdata[0]/sdata[1] comparison always selects sdata[0]; harmless.
  if (tid == 0)
    if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];
}

// Host driver: fills a[i] = i + 10, plants a known minimum of 5 at a[10],
// runs the block-level reduction on the GPU, then finishes the reduction
// of the NumBlock partial results on the CPU and prints it.
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;

  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int)); // one partial minimum per block
  for(i=0;i<length;i++) a[i] = (i + 10);
  a[10]=5; // known global minimum, makes the result easy to verify
  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);

  // Final CPU pass over the per-block minima; ">" keeps the smaller value
  // (the question's version had "<" here, which computed the maximum).
  min = out[0];
  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
  printf("min = %d\n", min);
  return 0;
}
$ nvcc -o t1074 t1074.cu
$ cuda-memcheck ./t1074
========= CUDA-MEMCHECK
min = 5
========= ERROR SUMMARY: 0 errors
$
$ cat t1074.cu
#include <stdio.h>
#include <stdlib.h>

#include <limits.h>

#define NumThread 128
#define NumBlock 32

__global__ void min_reduce(int* In, int* Out, int n){
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  while (i < n){
    if(In[i] < myMin)
    myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads();

  if (NumThread >= 1024){
    if (tid < 512)
    if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] = sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    if (tid < 64)
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] = sdata[tid + 64];
    __syncthreads();
  }
  // 不推荐使用以下做法(warp 同步编程)
  if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32]) smem[tid] = smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] = smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] = smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4]) smem[tid] = smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2]) smem[tid] = smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1]) smem[tid] = smem[tid+1];
  }
  if (tid == 0)
    if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];
}
$ cat t1074.cu
#include <stdio.h>
#include <stdlib.h>


#include <limits.h>

#define NumThread 128
#define NumBlock 32

// Grid-stride minimum reduction: each block writes its partial minimum
// (over the elements it visited) to Out[blockIdx.x]; the host then reduces
// the NumBlock partial results. Launched as min_reduce<<<NumBlock, NumThread>>>,
// so the code assumes blockDim.x == NumThread.
__global__ void min_reduce(int* In, int* Out, int n){
  // One shared-memory slot per thread for the tree reduction.
  __shared__ int sdata[NumThread];
  unsigned int i = blockIdx.x * NumThread + threadIdx.x;
  unsigned int tid = threadIdx.x;
  // Total number of threads in the grid; stride of the load loop below.
  unsigned int gridSize = NumBlock * NumThread;
  int myMin = INT_MAX;

  // Grid-stride load: each thread folds every gridSize-th element into
  // its private minimum before the shared-memory phase.
  while (i < n){
    if(In[i] < myMin)
    myMin = In[i];
    i += gridSize;
  }
  sdata[tid] = myMin;
  __syncthreads(); // all partial minima visible before the tree reduction

  // Tree reduction, unrolled on the compile-time NumThread. With
  // NumThread == 128 the three branches below (>=1024/512/256) are
  // compile-time false and never execute.
  if (NumThread >= 1024){
    if (tid < 512)
    if(sdata[tid] > sdata[tid + 512] ) sdata[tid] = sdata[tid + 512];
    __syncthreads();
  }
  if (NumThread >= 512){
    // NOTE(review): missing "if (tid < 256)" guard — would read out of
    // bounds if NumThread were 512; dead code at NumThread == 128.
    if(sdata[tid] > sdata[tid + 256] ) sdata[tid] = sdata[tid + 256];
    __syncthreads();
  }
  if (NumThread >= 256){
    // NOTE(review): same missing guard, and the "!= 0" test looks like a
    // leftover debugging hack; dead code at NumThread == 128.
    if(sdata[tid] > sdata[tid + 128] && sdata[tid + 128] !=0) sdata[tid] =  sdata[tid + 128];
    __syncthreads();
  }
  if (NumThread >= 128){
    // The "tid < 64" guard keeps sdata[tid + 64] in bounds — this is the
    // fix for the out-of-bounds read that cuda-memcheck reported.
    if (tid < 64)
    if(sdata[tid] > sdata[tid + 64] ) sdata[tid] =    sdata[tid + 64];
    __syncthreads();
  }
  //the following practice is deprecated
  // Warp-synchronous tail: within the last warp no __syncthreads() is used;
  // volatile forces each read/write to actually touch shared memory.
   if (tid < 32){
    volatile int *smem = sdata;
    if (NumThread >= 64) if(smem[tid] > smem[tid + 32] ) smem[tid] =  smem[tid+32];
    if (NumThread >= 32) if(smem[tid] > smem[tid + 16]) smem[tid] =  smem[tid+16];
    if (NumThread >= 16) if(smem[tid] > smem[tid + 8]) smem[tid] =  smem[tid+8];
    if (NumThread >= 8) if(smem[tid] > smem[tid + 4] ) smem[tid] =  smem[tid+4];
    if (NumThread >= 4) if(smem[tid] > smem[tid + 2] ) smem[tid] =  smem[tid+2];
    if (NumThread >= 2) if(smem[tid] > smem[tid + 1] )      smem[tid] =  smem[tid+1];
  }
  // After the tail, sdata[0] already holds the block minimum, so the
  // sdata[0]/sdata[1] comparison always selects sdata[0]; harmless.
  if (tid == 0)
    if(sdata[0] < sdata[1] ) Out[blockIdx.x] = sdata[0];
    else Out[blockIdx.x] = sdata[1];
}

// Host driver: fills a[i] = i + 10, plants a known minimum of 5 at a[10],
// runs the block-level reduction on the GPU, then finishes the reduction
// of the NumBlock partial results on the CPU and prints it.
int main(int argc, char* argv[]){
  unsigned int length = 1048576;
  int i, Size, min;
  int *a, *out, *gpuA, *gpuOut;

  cudaSetDevice(0);
  Size = length * sizeof(int);
  a = (int*)malloc(Size);
  out = (int*)malloc(NumBlock*sizeof(int)); // one partial minimum per block
  for(i=0;i<length;i++) a[i] = (i + 10);
  a[10]=5; // known global minimum, makes the result easy to verify
  cudaMalloc((void**)&gpuA,Size);
  cudaMalloc((void**)&gpuOut,NumBlock*sizeof(int));
  cudaMemcpy(gpuA,a,Size,cudaMemcpyHostToDevice);
  min_reduce<<<NumBlock,NumThread>>>(gpuA,gpuOut,length);
  cudaDeviceSynchronize();
  cudaMemcpy(out,gpuOut,NumBlock*sizeof(int),cudaMemcpyDeviceToHost);

  // Final CPU pass over the per-block minima; ">" keeps the smaller value
  // (the question's version had "<" here, which computed the maximum).
  min = out[0];
  for(i=1;i<NumBlock;i++) if(min > out[i]) min = out[i];
  printf("min = %d\n", min);
  return 0;
}
$ nvcc -o t1074 t1074.cu
$ cuda-memcheck ./t1074
========= CUDA-MEMCHECK
min = 5
========= ERROR SUMMARY: 0 errors
$