Cuda 当我编译这段代码时,这里出现了一些错误:1)表达式必须是可修改的左值2)标识符__synchthreads";是未定义的 #包括 使用名称空间std; #定义最小值(x,y)(x>y?x:y) #定义n33*1024 #定义ThreadPerBlock 256 //大于或等于N的threadsPerBlock的最小倍数 #定义blockPerGrid最小值(32,(N+ThreadPerBlock-1)/ThreadPerBlock) __全局无效向量点积(常数浮点*V1,常数浮点*V2,浮点*V3) { __共享_uuuuuu浮动槽[螺纹块]; 浮子温度; const unsigned int tid=blockDim.x*blockIdx.x+threadIdx.x; 常量unsigned int chacheindex=threadIdx.x; 而(tid

Cuda 当我编译这段代码时,这里出现了一些错误:1)表达式必须是可修改的左值2)标识符__synchthreads";是未定义的 #包括 使用名称空间std; #定义最小值(x,y)(x>y?x:y) #定义n33*1024 #定义ThreadPerBlock 256 //大于或等于N的threadsPerBlock的最小倍数 #定义blockPerGrid最小值(32,(N+ThreadPerBlock-1)/ThreadPerBlock) __全局无效向量点积(常数浮点*V1,常数浮点*V2,浮点*V3) { __共享_uuuuuu浮动槽[螺纹块]; 浮子温度; const unsigned int tid=blockDim.x*blockIdx.x+threadIdx.x; 常量unsigned int chacheindex=threadIdx.x; 而(tid,cuda,Cuda,关于这一点: 标识符“\uuuSynchThreads”未定义 无论您身在何处: #include <iostream> using namespace std ; #define min(x,y) (x>y?x:y) #define N 33*1024 #define ThreadPerBlock 256 //smallest multiple of threadsPerBlock that is greater than or equal to N #define b

关于这一点:

标识符“\uuuSynchThreads”未定义

无论您身在何处:

#include <iostream>
using namespace std ;

#define min(x,y) (x>y?x:y)
#define N 33*1024

#define ThreadPerBlock 256

//smallest multiple of threadsPerBlock that is greater than or equal to N
#define blockPerGrid min(32 , (N+ThreadPerBlock-1) / ThreadPerBlock )

__global__ void Vector_Dot_Product ( const float *V1 , const float *V2 , float *V3   )
{
 __shared__ float chache[ThreadPerBlock] ;

float temp ;

 const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;

 const unsigned int chacheindex = threadIdx.x ;

 while ( tid < N )
{
  temp += V1[tid] * V2[tid] ;

  tid += blockDim.x * gridDim.x ;
 }

  chache[chacheindex] = temp ;

 __synchthreads () ;

 int i  = blockDim.x / 2 ;

while ( i!=0 )
 {

  if ( chacheindex < i )
     chache[chacheindex] += chache [chacheindex + i] ;

 __synchthreads () ;

   i/=2 ;
 }

  if ( chacheindex == 0 )
     V3[blockIdx.x] = chache [0] ;


 }

 int main ( int argv , char *argc )
 {
  float *V1_H , *V2_H , *V3_H ;
  float *V1_D , *V2_D , *V3_D ;

  V1_H = new float [N]  ;
  V2_H = new float [N]  ;
  V3_H = new float [blockPerGrid]  ;

  cudaMalloc ( (void **)&V1_D , N*sizeof(float)) ;

  cudaMalloc ( (void **)&V2_D , N*sizeof(float)) ;

  cudaMalloc ( (void **)&V3_D , blockPerGrid*sizeof(float)) ;

  for ( int i = 0 ; i<N ; i++ )
  {

          V1_H[i] = i ;

          V2_H[i] = i*2 ;
   } 

   cudaMemcpy ( V1_D , V1_H , N*sizeof(float) , cudaMemcpyHostToDevice ) ;

   cudaMemcpy ( V2_D , V2_H , N*sizeof(float) , cudaMemcpyHostToDevice ) ;

   Vector_Dot_Product <<<blockPerGrid , ThreadPerBlock >>> (V1_D , V2_D , V3_D ) ;

    cudaMemcpy ( V3_H , V3_D , N*sizeof(float) , cudaMemcpyDeviceToHost ) ;

    cout <<"\n Vector Dot Prodcut is : " ;

    float sum = 0 ;

    for ( int i = 0 ; i<blockPerGrid ; i++ )
           sum+=V3_H[i] ;
    cout << sum << endl ;

    cudaFree ( V1_D) ;
    cudaFree ( V2_D) ;
    cudaFree ( V3_D) ;

       delete [] V1_H ;
       delete [] V2_H ;
       delete [] V3_H ;

    }
您应该将其更改为:

__synchthreads();
关于这一点:

表达式必须是可修改的左值

由于您在此处将
tid
定义为
const

__syncthreads();
不允许您在此处尝试更改它:

const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;
因此,最简单的解决方案可能是从
tid
定义中删除
const

tid += blockDim.x * gridDim.x ;

syncthreads有一个h而不是2您已将tid定义为常量,因此无法在内核中修改它(tid+=…)
unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;