CUDA无效设备符号错误_Cuda_Nvcc

CUDA无效设备符号错误

cuda

CUDA无效设备符号错误,cuda,nvcc,Cuda,Nvcc,下面的代码编译得很好。但当我试着运行它时，我得到了 GPUassert: invalid device symbol file.cu 114 当我用（！！！）标记注释行时，错误不会显示。我的问题是，是什么导致了这个错误，因为它让我毫无意义使用nvcc file.cu编译-arch compute_11 #include "stdio.h" #include <algorithm> #include <ctime> #define gpuErrchk(ans) { g

下面的代码编译得很好。但当我试着运行它时，我得到了

GPUassert: invalid device symbol file.cu 114

当我把带有 (!!!) 标记的行注释掉时，错误就不会出现。我的问题是：是什么导致了这个错误？因为在我看来它毫无道理。

使用 `nvcc file.cu -arch compute_11` 编译。

#include "stdio.h"
#include <algorithm>
#include <ctime>

// Wraps a CUDA runtime call and routes its status to gpuAssert together with
// the call site's file and line. The do/while(0) form makes the macro expand
// to exactly one statement, so it is safe inside un-braced if/else bodies.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// NOTE(review): presumably the intended launch geometry (threads per block /
// number of blocks); no kernel launch is visible in this file -- confirm.
#define THREADS 64
#define BLOCKS 256
// Width of the integer sub-range handled by one thread: the span [121, 2^32)
// divided over THREADS*BLOCKS threads, rounded up.
#define _dif (((1ll<<32)-121)/(THREADS*BLOCKS)+1)

#define HASH_SIZE 1024
#define ROUNDS 16
// ceil(HASH_SIZE / ROUNDS). Fully parenthesised so the macro can be used
// inside larger expressions (e.g. HASH_ROW*2) without precedence surprises.
#define HASH_ROW ((HASH_SIZE/ROUNDS)+(HASH_SIZE%ROUNDS==0?0:1))
// Capacity of one output row. Parenthesised for the same reason as HASH_ROW.
#define HASH_COL (1000000000/HASH_SIZE)


typedef unsigned long long ull;

// Reports a failed CUDA runtime call and optionally terminates the process.
//
//   code  - status returned by the CUDA runtime call being checked
//   file  - source file of the call site (normally __FILE__, a string
//           literal, hence const char*)
//   line  - source line of the call site (normally __LINE__)
//   abort - when true (the default) the process exits with `code` as its
//           exit status after the message is printed
//
// Does nothing when code == cudaSuccess. The message goes to stdout.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
  if (code == cudaSuccess) return;

  //fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  printf("GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
  if (abort) exit(code);
}

// Table of bases: prime() reads one entry and passes it to is_SPRP as the base.
// NOTE(review): nothing in the visible code fills this table -- confirm where
// it is populated.
__device__ unsigned int primes[1024]; 
//__device__ unsigned char primes[(1<<28)+1];
// Exclusive upper bound (2^32) on the integers scanned by the find kernel.
__device__ long long n = 1ll<<32; 
// Written from the host in main() via cudaGetSymbolAddress + cudaMemset; not
// read by any device code shown here.
__device__ ull dev_base;
// Selects which window of HASH_ROW hash values find handles: a candidate's row
// is hashh(i) - dev_hash*HASH_ROW.
__device__ unsigned int dev_hash; 
// NOTE(review): not referenced anywhere in the visible code -- confirm usage.
__device__ unsigned int dev_index; 

// NOTE(review): host-side timestamp, unused in the visible code.
time_t curtime;

// Hash of a candidate: discard the lowest bit of x, then reduce modulo 1024.
__device__ int hashh(long long x) {
  const long long halved = x >> 1;
  return halved % 1024;
}
// Modular exponentiation: returns (x^e) % n by square-and-multiply.
// (Despite the name, this is a power, not a single multiplication.)
//
//   x - base; reduced modulo n up front so the first squaring cannot overflow
//   e - exponent
//   n - modulus; must be non-zero. Intermediate products are formed in 64
//       bits, so results are only exact while (n-1)^2 fits in 64 bits,
//       i.e. for n <= 2^32.
//
// Returns 0 when n == 1 (every value is congruent to 0 modulo 1).
__device__ ull mulmod(ull x,ull e,ull n) {
  ull ans = 1 % n;   // 1 for n > 1, 0 for n == 1
  x %= n;            // keep x < n so x*x and ans*x stay below n^2
  while (e > 0) {
    if (e & 1) ans = (ans * x) % n;
    x = (x * x) % n;
    e >>= 1;
  }
  return ans;
}

// Strong probable-prime test of n to base a.
//
// Writes n-1 = t * 2^d with t odd, computes x = a^t mod n, and accepts when
// x == 1 or when one of x, x^2, ..., x^(2^(d-1)) equals n-1.
//
//   a - base of the test
//   n - number under test; expected to be ODD. The squaring below is done in
//       64 bits, so results are only exact for n <= 2^32.
//
// Returns 1 if n passes the test for base a, 0 otherwise. n == 0 and n == 1
// return 0 immediately: for n == 1 the factor-of-two loop would never end
// (n-1 == 0), and n == 0 would be a modulus of zero.
__device__ int is_SPRP(ull a,ull n) {
  if (n <= 1) return 0;

  // Strip the factors of two from n-1.
  int d=0;
  ull t = n-1;
  while(t%2==0) {
      ++d;
      t>>=1;
  }

  ull x = mulmod(a,t,n);
  if(x==1) return 1;
  // Repeated squaring: look for -1 (mod n) among the first d values.
  for(int i=0;i<d;++i) {
      if(x==n-1) return 1;
      x=(x*x)%n;
  }
  return 0;
}


// Runs one strong-probable-prime round on x. The base is read from the primes
// table at a slot derived from x: multiply by 0xAFF7B4, shift right by 7, then
// reduce modulo 1024. Returns whatever is_SPRP returns for that base.
__device__ int prime(long long x) {
  const long long slot = (((long long)0xAFF7B4 * x) >> 7) % 1024;
  const unsigned long long base = (unsigned long long)primes[slot];
  return is_SPRP(base, (unsigned long long)x);
}

// Kernel: each thread scans the odd integers of its own sub-range
// [b, b+_dif) below n, skips multiples of 3, 5 and 7, and keeps the values
// whose hash row (hashh(i) - dev_hash*HASH_ROW) lies in [0, HASH_ROW) and for
// which prime(i) returns 0. Kept values are buffered per row and copied to
// `out`; `c` holds one counter per row, advanced with atomicAdd to reserve
// space in that row.
//
//   out - destination buffer, addressed as out + row*HASH_COL*4 + offset
//   c   - per-row counters (HASH_ROW entries are touched)
//
// NOTE(review): no launch of this kernel is visible; the expected grid/block
// shape is presumably BLOCKS x THREADS -- confirm.
__global__ void find(unsigned int *out,unsigned int *c) {

// Per-thread buffers: HASH_ROW (64) rows of 256 unsigned ints = 0x10000 bytes,
// plus 64 ints of fill counts = 0x100 bytes. This is the 0x10100 bytes of
// local data that ptxas rejects for sm_11 (0x4000 max).
unsigned int buff[HASH_ROW][256];
int local_c[HASH_ROW];
for(int i=0;i<HASH_ROW;++i) local_c[i]=0;

// This thread's range starts at 121 plus its global index times _dif; _dif is
// long long, so the product is evaluated in 64 bits.
long long b = 121+(threadIdx.x+blockIdx.x*blockDim.x)*_dif;
long long e = b+_dif;
// Start on an odd value; the loop then steps by 2.
if(b%2==0) ++b;
for(long long i=b;i<e && i<n;i+=2) {
    if(i%3==0 || i%5==0 || i%7==0) continue;
    // Row of this candidate relative to the window selected by dev_hash.
    int hash_num = hashh(i)-(dev_hash*(HASH_ROW));
    if(0<=hash_num && hash_num<HASH_ROW) {
    // Values that pass the probable-prime test are skipped, not stored.
    if(prime(i)) continue;
    buff[hash_num][local_c[hash_num]++]=(unsigned int)i;
    // Row buffer full: reserve 256 slots in the global row and flush.
    if(local_c[hash_num]==256) {
        int start = atomicAdd(c+hash_num,local_c[hash_num]);
        // Row would overflow its HASH_COL capacity: the whole thread stops.
        // NOTE(review): this also abandons values buffered for other rows.
        if(start+local_c[hash_num]>=HASH_COL) return;

        // NOTE(review): pointer arithmetic on unsigned int* already scales by
        // the element size, so the *4 makes the row stride 4*HASH_COL
        // elements -- confirm this is intended.
        unsigned int *out_offset = out+hash_num*(HASH_COL)*4;
        for(int i=0;i<local_c[hash_num];++i) out_offset[i+start]=buff[hash_num][i]; //(!!!)
        local_c[hash_num]=0;
    }
    }
}
// Flush whatever is left in each row buffer.
for(int i=0;i<HASH_ROW;++i) {
  int start = atomicAdd(c+i,local_c[i]);
  if(start+local_c[i]>=HASH_COL) return;
  unsigned int *out_offset = out+i*(HASH_COL)*4;
  for(int j=0;j<local_c[i];++j) out_offset[j+start]=buff[i][j]; //(!!!)
}

}

// Host entry point: prints the derived table dimensions, then sets the device
// variable dev_base through its symbol address. Every CUDA call is checked
// with gpuErrchk, which prints the error and exits on failure.
int main(void) {
printf("HASH_ROW: %d\nHASH_COL: %d\nPRODUCT: %d\n",(int)HASH_ROW,(int)HASH_COL,(int)(HASH_ROW)*(HASH_COL));

ull *base_adr;
gpuErrchk(cudaGetSymbolAddress((void**)&base_adr,dev_base));
// Clear all sizeof(ull) bytes of dev_base so no byte is left unset ...
gpuErrchk(cudaMemset(base_adr,0,sizeof(ull)));
// ... then set its first byte to 0x02.
// NOTE(review): this yields the value 2 assuming little-endian device
// memory -- confirm.
gpuErrchk(cudaMemset(base_adr,0x02,1));
return 0;
}

#包括“stdio.h”
#包括
#包括
#定义gpuerchk（ans）{gpuAssert（（ans），_文件_，_行__）}
#定义线程64
#定义块256
#定义_dif（（1ll>=1；
}
ull x=mulmod（a，t，n）；
如果（x==1）返回1；
对于（int i=0；i>7）%1024]，（无符号长）x）；
}
__全局无效查找（无符号整数*out，无符号整数*c）{
无符号整型buff[HASH_ROW][256]；
int local_c[哈希_行]；
对于（inti=0；i
这是一个相当不寻常的错误。
发生故障的原因是：

1. 仅指定虚拟体系结构（-arch compute_11）会把 PTX 编译步骤推迟到运行时（即强制 JIT 编译）。
2. JIT 编译在运行时失败。
3. JIT 编译（和链接）失败意味着设备符号无法正确建立。
4. 由于设备符号没有建立，对设备符号 dev_base 执行的 cudaGetSymbolAddress 操作失败，并引发错误。

为什么 JIT 编译会失败？您可以通过指定 -arch=sm_11 而不是 -arch compute_11 来触发机器代码编译（即运行 ptxas 汇编程序），从而发现问题。如果这样做，您将得到以下结果：
ptxas error   : Entry function '_Z4findPjS_' uses too much local data (0x10100 bytes, 0x4000 max)

因此，即使您的代码没有调用 find 内核，该内核也必须成功编译，才能为符号提供一个健全的设备环境。
为什么会发生此编译错误？因为 cc1.1 设备上每个线程可用的本地内存有上限（0x4000 字节，即 16KB），而 find 内核请求的本地内存远超这个上限（0x10100 字节，超过 64KB）。
当我最初在自己的设备上尝试时，我使用的是限制更高（每个线程 512KB）的 cc2.0 设备，因此 JIT 编译步骤成功了。
一般来说，我建议同时指定虚拟体系结构和机器体系结构，实现这一点的简写方法是：
nvcc -arch=sm_11 ....

（对于cc1.1设备）
这可能也很有趣，并且提供了有关虚拟机与机器体系结构的更多详细信息，以及如何为每个虚拟机指定编译阶段
我相信，当您注释掉内核中的那些特定行时，错误消失的原因是：在这些行被注释掉之后，编译器能够优化掉对这些本地内存区域的访问，进而优化掉本地内存的分配。这使得 JIT 编译步骤能够成功完成，您的代码也就可以在“没有运行时错误”的情况下运行。
您可以通过注释掉这些行，然后指定完整的编译（nvcc -arch=sm_11 …）来验证这一点，其中 -arch 是 --gpu-architecture 的缩写。

此错误通常意味着内核是为错误的体系结构编译的。您需要先确定 GPU 的计算能力，然后为该体系结构编译代码。例如，如果您的 GPU 具有计算能力 1.1，则使用 -arch=sm_11 编译。您还可以为多个体系结构构建可执行文件。
这可能与错误无关，但您这句话的意思是什么：if(0<=hash_num<HASH_ROW)？
感谢您的评论，这确实是一个错误，但它无法解决问题。
我无法重现该错误。代码编译并运行时没有出错。您用的是什么 CUDA 版本？什么操作系统？什么 GPU？
设备 0：“GeForce 9800 GT”，CUDA 驱动程序版本/运行时版本 5.5/5.5，Kubuntu 12.04