Cuda __同步线程似乎不起作用
我有一个简单的内核,它应该计算AB数组的乘法和,但是_syncthreads()似乎根本不起作用,我调试了它,temp[I]为一些元素返回未初始化的值。如果我省略了uu syncthreads(),结果是相同的。(我检查过,cuda代码的所有其他部分(如数组初始化、复制到内存等)都写得很好,所以问题出在这个内核中)(注意:我不想使用atomicAdd)Cuda __同步线程似乎不起作用,cuda,Cuda,我有一个简单的内核,它应该计算AB数组的乘法和,但是_syncthreads()似乎根本不起作用,我调试了它,temp[I]为一些元素返回未初始化的值。如果我省略了uu syncthreads(),结果是相同的。(我检查过,cuda代码的所有其他部分(如数组初始化、复制到内存等)都写得很好,所以问题出在这个内核中)(注意:我不想使用atomicAdd) #包括 #包括 #包括 #包括 #包括“cuda_runtime.h” #包括“设备启动参数.h” #定义MINREAL-1024.0 #定义M
#包括
#包括
#包括
#包括
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#定义MINREAL-1024.0
#定义MAXREAL 1024.0
#定义精度为0.01
#定义\u GPU\u线程的数量256
无效检查CUDAERROR(常量字符*msg){
cudaError_t err=cudaGetLastError();
如果(cudaSuccess!=错误){
fprintf(标准,“Cuda错误:%s:%s.\n”,消息,cudaGetErrorString(err));
退出(退出失败);
}
}
void vecFillRand(整数N,浮点*vec){
int i;
对于(i=0;i=gridDim.x*blockDim.x)
返回;
temp[threadIdx.x]=a[idx]*b[idx];
__同步线程();
if(threadIdx.x==0){
c[blockIdx.x]=0.0f;
for(int i=0;i精度){
printf(“测试失败:%f\n”,值);
}
否则{
printf(“测试通过\n”);
}
库达弗里(杜阿);
库达弗里(杜布);
库达弗里(d_C);
免费(h_A);
免费(h_B);
免费(h_C);
返回0;
}
您的内核需要稍微重新编写
“最后一个块”将杀死任何超过输入向量长度的线程,但对最后一个块的元素求和的for循环没有正确检查以确保它没有超过向量长度。这会导致越界读取访问,当您使用cuda memcheck运行代码时会出现这种情况(假设您输入的向量大小不是256的倍数)
此外,不应在条件代码中使用\uuuu syncthreads()
,除非该条件在块中的所有线程中计算相同的值。最后一个块将违反此规则,因为向量长度不是256的倍数
除此之外,对于较大的向量大小,您对float
数量的精度期望过高(位数太多)。您需要根据相加的浮点值的数量来调整精度测试
下面是代码的修改版本,其中包括我上面提到的更改,这些更改
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#define MINREAL -1024.0
#define MAXREAL 1024.0
#define ACCURACY 0.01
#define NUM_OF_GPU_THREADS 256
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err){
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
void vecFillRand(int N, float *vec) {
int i;
for(i = 0; i < N; i++)
vec[i] = (rand() / (float) RAND_MAX)*(MAXREAL - MINREAL) + MINREAL;
}
float seq_dotProduct(float *a, float *b, int n) {
int i;
float dp;
dp = 0;
for(i = 0; i < n; i++) {
dp += a[i] * b[i];
}
return dp;
}
// krenel
__global__ void dotProduct(float *a, float *b, float *c) {
__shared__ float temp[NUM_OF_GPU_THREADS];
int idx = threadIdx.x + blockIdx.x * blockDim.x;
float t;
if (idx >= gridDim.x * blockDim.x)
return;
temp[threadIdx.x] = a[idx] * b[idx];
__syncthreads();
if(threadIdx.x == 0) {
c[blockIdx.x] = 0.0f;
for(int i = 0; i < NUM_OF_GPU_THREADS; i++){
t = temp[i];
c[blockIdx.x] = c[blockIdx.x] + t;
}
}
}
int main(int argc, char* argv[]) {
int i, n, ARRAY_BYTES;
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
float sum;
float seq_sum;
clock_t t;
srand(time(NULL));
if (argc == 2) {
n = atoi(argv[1]);
} else {
printf("N? ");
fflush(stdout);
scanf("%d", &n);
}
int BLOCKS_PER_GRID = (unsigned int)ceil(n/(float)NUM_OF_GPU_THREADS);
// arrays n host
ARRAY_BYTES = n * sizeof(float);
h_A = (float *) malloc(ARRAY_BYTES);
h_B = (float *) malloc(ARRAY_BYTES);
h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
printf("\ncreating A and B...\n\n");
vecFillRand(n, h_A);
vecFillRand(n, h_B);
vecFillRand(BLOCKS_PER_GRID, h_C);
// arrays on device
cudaMalloc((void**) &d_A, ARRAY_BYTES);
cudaMalloc((void**) &d_B, ARRAY_BYTES);
cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));
// transfer the arrays to the GPU
cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID, cudaMemcpyHostToDevice);
// TIME START
// create events for timing execution
cudaEvent_t start = cudaEvent_t();
cudaEvent_t stop = cudaEvent_t();
cudaEventCreate( &start );
cudaEventCreate( &stop );
// record time into start event
cudaEventRecord( start, 0 ); // 0 is the default stream id
// launch the kernel
dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
dim3 grid(BLOCKS_PER_GRID);
printf ("computing dotProduct... \n");
dotProduct<<<grid, block>>>(d_A, d_B, d_C);
// block until the device has completed
cudaThreadSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
// TIME END
// record time into stop event
cudaEventRecord( stop, 0 );
// synchronize stop event to wait for end of kernel execution on stream 0
cudaEventSynchronize( stop );
// compute elapsed time (done by CUDA run-time)
float elapsed_kernel = 0.f;
cudaEventElapsedTime( &elapsed_kernel, start, stop );
// release events
cudaEventDestroy( start );
cudaEventDestroy( stop );
// print krenel time
printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);
// copy back the result array to the CPU
cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost );
// Check for any CUDA errors
checkCUDAError("memcpy");
// compute sum
sum = 0;
for (i = 0; i < BLOCKS_PER_GRID; i++)
sum += h_C[i];
// launch sequential
t = clock();
printf ("computing seq_dotProduct... \n");
seq_sum = seq_dotProduct(h_A, h_B, n);
t = clock() - t;
printf ("SEQ TIME: %f \n\n",((float)t)/CLOCKS_PER_SEC);
// check sum and seq_sum
float value = abs(sum - seq_sum);
if (value > ACCURACY) {
printf("Test FAILED: %f \n", value);
}
else{
printf("Test PASSED \n");
}
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"
#define MINREAL -1024.0
#define MAXREAL 1024.0
#define FAST_RED
#define ACCURACY 0.0001
#define NUM_OF_GPU_THREADS 256
void checkCUDAError(const char *msg) {
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err){
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
void vecFillRand(int N, float *vec) {
int i;
for(i = 0; i < N; i++)
vec[i] = (rand() / (float) RAND_MAX)*(MAXREAL - MINREAL) + MINREAL;
}
float seq_dotProduct(float *a, float *b, int n) {
int i;
float dp;
dp = 0;
for(i = 0; i < n; i++) {
dp += a[i] * b[i];
}
return dp;
}
// krenel
__global__ void dotProduct(float *a, float *b, float *c, int n) {
__shared__ float temp[NUM_OF_GPU_THREADS];
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < n)
temp[threadIdx.x] = a[idx] * b[idx];
else temp[threadIdx.x] = 0.0f;
__syncthreads();
#ifdef FAST_RED
// assumes block dimension is a power of 2
for (int i = blockDim.x>>1; i > 0; i >>= 1){
if (threadIdx.x < i) temp[threadIdx.x] += temp[threadIdx.x+i];
__syncthreads();}
if (threadIdx.x == 0) c[blockIdx.x] = temp[0];
#else
float t;
if(threadIdx.x == 0) {
c[blockIdx.x] = 0.0f;
int j=0;
for(int i = blockIdx.x*blockDim.x; ((i < ((blockIdx.x+1)*blockDim.x)) && (i < n)); i++){
t = temp[j++];
c[blockIdx.x] = c[blockIdx.x] + t;
}
}
#endif
}
int main(int argc, char* argv[]) {
int i, n, ARRAY_BYTES;
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
float sum;
float seq_sum;
clock_t t;
srand(time(NULL));
if (argc == 2) {
n = atoi(argv[1]);
} else {
printf("N? ");
fflush(stdout);
scanf("%d", &n);
}
int BLOCKS_PER_GRID = (unsigned int)ceil(n/(float)NUM_OF_GPU_THREADS);
printf("bpg = %d\n", BLOCKS_PER_GRID);
// arrays n host
ARRAY_BYTES = n * sizeof(float);
h_A = (float *) malloc(ARRAY_BYTES);
h_B = (float *) malloc(ARRAY_BYTES);
h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
printf("\ncreating A and B...\n\n");
vecFillRand(n, h_A);
vecFillRand(n, h_B);
vecFillRand(BLOCKS_PER_GRID, h_C);
// arrays on device
cudaMalloc((void**) &d_A, ARRAY_BYTES);
cudaMalloc((void**) &d_B, ARRAY_BYTES);
cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));
// transfer the arrays to the GPU
cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyHostToDevice);
// TIME START
// create events for timing execution
cudaEvent_t start = cudaEvent_t();
cudaEvent_t stop = cudaEvent_t();
cudaEventCreate( &start );
cudaEventCreate( &stop );
// record time into start event
cudaEventRecord( start, 0 ); // 0 is the default stream id
// launch the kernel
dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
dim3 grid(BLOCKS_PER_GRID);
printf ("computing dotProduct... \n");
dotProduct<<<grid, block>>>(d_A, d_B, d_C, n);
// block until the device has completed
cudaDeviceSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
// TIME END
// record time into stop event
cudaEventRecord( stop, 0 );
// synchronize stop event to wait for end of kernel execution on stream 0
cudaEventSynchronize( stop );
// compute elapsed time (done by CUDA run-time)
float elapsed_kernel = 0.f;
cudaEventElapsedTime( &elapsed_kernel, start, stop );
// release events
cudaEventDestroy( start );
cudaEventDestroy( stop );
// print krenel time
printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);
// copy back the result array to the CPU
cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost );
// Check for any CUDA errors
checkCUDAError("memcpy");
// compute sum
sum = 0;
for (i = 0; i < BLOCKS_PER_GRID; i++)
sum += h_C[i];
// launch sequential
t = clock();
printf ("computing seq_dotProduct... \n");
seq_sum = seq_dotProduct(h_A, h_B, n);
t = clock() - t;
printf ("SEQ TIME: %f \n\n",((float)t)/CLOCKS_PER_SEC);
// check sum and seq_sum
float value = abs((sum - seq_sum)/sum);
if (value > ACCURACY) {
printf("Test FAILED: err: %f cpu: %f gpu: %f \n", value, seq_sum, sum);
}
else{
printf("Test PASSED \n");
}
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
return 0;
}