Cuda 当我编译这段代码时,这里出现了一些错误:1)表达式必须是可修改的左值2)标识符__synchthreads";是未定义的 #包括 使用名称空间std; #定义最小值(x,y)(x>y?x:y) #定义n33*1024 #定义ThreadPerBlock 256 //大于或等于N的threadsPerBlock的最小倍数 #定义blockPerGrid最小值(32,(N+ThreadPerBlock-1)/ThreadPerBlock) __全局无效向量点积(常数浮点*V1,常数浮点*V2,浮点*V3) { __共享_uuuuuu浮动槽[螺纹块]; 浮子温度; const unsigned int tid=blockDim.x*blockIdx.x+threadIdx.x; 常量unsigned int chacheindex=threadIdx.x; 而(tid
关于这一点: 标识符“\uuuSynchThreads”未定义 无论您身在何处:Cuda 当我编译这段代码时,这里出现了一些错误:1)表达式必须是可修改的左值2)标识符__synchthreads";是未定义的 #包括 使用名称空间std; #定义最小值(x,y)(x>y?x:y) #定义n33*1024 #定义ThreadPerBlock 256 //大于或等于N的threadsPerBlock的最小倍数 #定义blockPerGrid最小值(32,(N+ThreadPerBlock-1)/ThreadPerBlock) __全局无效向量点积(常数浮点*V1,常数浮点*V2,浮点*V3) { __共享_uuuuuu浮动槽[螺纹块]; 浮子温度; const unsigned int tid=blockDim.x*blockIdx.x+threadIdx.x; 常量unsigned int chacheindex=threadIdx.x; 而(tid,cuda,Cuda,关于这一点: 标识符“\uuuSynchThreads”未定义 无论您身在何处: #include <iostream> using namespace std ; #define min(x,y) (x>y?x:y) #define N 33*1024 #define ThreadPerBlock 256 //smallest multiple of threadsPerBlock that is greater than or equal to N #define b
#include <iostream>
using namespace std ;
#define min(x,y) (x>y?x:y)
#define N 33*1024
#define ThreadPerBlock 256
//smallest multiple of threadsPerBlock that is greater than or equal to N
#define blockPerGrid min(32 , (N+ThreadPerBlock-1) / ThreadPerBlock )
__global__ void Vector_Dot_Product ( const float *V1 , const float *V2 , float *V3 )
{
__shared__ float chache[ThreadPerBlock] ;
float temp ;
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;
const unsigned int chacheindex = threadIdx.x ;
while ( tid < N )
{
temp += V1[tid] * V2[tid] ;
tid += blockDim.x * gridDim.x ;
}
chache[chacheindex] = temp ;
__synchthreads () ;
int i = blockDim.x / 2 ;
while ( i!=0 )
{
if ( chacheindex < i )
chache[chacheindex] += chache [chacheindex + i] ;
__synchthreads () ;
i/=2 ;
}
if ( chacheindex == 0 )
V3[blockIdx.x] = chache [0] ;
}
int main ( int argv , char *argc )
{
float *V1_H , *V2_H , *V3_H ;
float *V1_D , *V2_D , *V3_D ;
V1_H = new float [N] ;
V2_H = new float [N] ;
V3_H = new float [blockPerGrid] ;
cudaMalloc ( (void **)&V1_D , N*sizeof(float)) ;
cudaMalloc ( (void **)&V2_D , N*sizeof(float)) ;
cudaMalloc ( (void **)&V3_D , blockPerGrid*sizeof(float)) ;
for ( int i = 0 ; i<N ; i++ )
{
V1_H[i] = i ;
V2_H[i] = i*2 ;
}
cudaMemcpy ( V1_D , V1_H , N*sizeof(float) , cudaMemcpyHostToDevice ) ;
cudaMemcpy ( V2_D , V2_H , N*sizeof(float) , cudaMemcpyHostToDevice ) ;
Vector_Dot_Product <<<blockPerGrid , ThreadPerBlock >>> (V1_D , V2_D , V3_D ) ;
cudaMemcpy ( V3_H , V3_D , N*sizeof(float) , cudaMemcpyDeviceToHost ) ;
cout <<"\n Vector Dot Prodcut is : " ;
float sum = 0 ;
for ( int i = 0 ; i<blockPerGrid ; i++ )
sum+=V3_H[i] ;
cout << sum << endl ;
cudaFree ( V1_D) ;
cudaFree ( V2_D) ;
cudaFree ( V3_D) ;
delete [] V1_H ;
delete [] V2_H ;
delete [] V3_H ;
}
您应该将其更改为:
__synchthreads();
关于这一点:
表达式必须是可修改的左值
由于您在此处将tid
定义为const
:
__syncthreads();
不允许您在此处尝试更改它:
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;
因此,最简单的解决方案可能是从tid
定义中删除const
:
tid += blockDim.x * gridDim.x ;
syncthreads有一个h而不是2您已将tid定义为常量,因此无法在内核中修改它(tid+=…)
unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x ;