CUDA __syncthreads() 似乎不起作用

CUDA __syncthreads() 似乎不起作用,cuda,Cuda,我有一个简单的内核,它应该计算 A、B 数组的乘积和,但是 __syncthreads() 似乎根本不起作用。我调试了它,temp[i] 对某些元素返回了未初始化的值。如果我省略 __syncthreads(),结果是相同的。(我检查过,cuda 代码的所有其他部分(如数组初始化、复制到内存等)都写得很好,所以问题出在这个内核中)(注意:我不想使用 atomicAdd) #include <stdio.h> #include <stdlib.h> #include <math.h> #include <time.h> #include "cuda_runtime.h" #include "device_launch_parameters.h" #define MINREAL -1024.0 #define M

我有一个简单的内核,它应该计算 A、B 数组的乘积和,但是 __syncthreads() 似乎根本不起作用。我调试了它,temp[i] 对某些元素返回了未初始化的值。如果我省略 __syncthreads(),结果是相同的。(我检查过,cuda 代码的所有其他部分(如数组初始化、复制到内存等)都写得很好,所以问题出在这个内核中)(注意:我不想使用 atomicAdd)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define MINREAL -1024.0
#define MAXREAL 1024.0

#define ACCURACY 0.01

#define NUM_OF_GPU_THREADS 256

void checkCUDAError(const char *msg) {
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err){
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }
}

void vecFillRand(int N, float *vec) {
    int i;
    for(i = 0; i < N; i++)
        vec[i] = (rand() / (float) RAND_MAX)*(MAXREAL - MINREAL) + MINREAL;
}

float seq_dotProduct(float *a, float *b, int n) {
    int i;
    float dp;
    dp = 0;
    for(i = 0; i < n; i++) {
        dp += a[i] * b[i];
    }
    return dp;
}

// kernel
__global__ void dotProduct(float *a, float *b, float *c) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float t;

    if (idx >= gridDim.x * blockDim.x)
        return;

    temp[threadIdx.x] = a[idx] * b[idx];

    __syncthreads();

    if(threadIdx.x == 0) {
        c[blockIdx.x] = 0.0f;
        for(int i = 0; i < NUM_OF_GPU_THREADS; i++){
            t = temp[i];
            c[blockIdx.x] = c[blockIdx.x] + t;
        }
    }
}

int main(int argc, char* argv[]) {
    int i, n, ARRAY_BYTES;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    float sum;
    float seq_sum;
    clock_t t;

    srand(time(NULL));

    if (argc == 2) {
        n = atoi(argv[1]);
    } else {
        printf("N? ");
        fflush(stdout);
        scanf("%d", &n);
    }

    int BLOCKS_PER_GRID = (unsigned int)ceil(n/(float)NUM_OF_GPU_THREADS);

    ARRAY_BYTES = n * sizeof(float);
    h_A = (float *) malloc(ARRAY_BYTES);
    h_B = (float *) malloc(ARRAY_BYTES);
    h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
    vecFillRand(n, h_A);
    vecFillRand(n, h_B);
    vecFillRand(BLOCKS_PER_GRID, h_C);

    cudaMalloc((void**) &d_A, ARRAY_BYTES);
    cudaMalloc((void**) &d_B, ARRAY_BYTES);
    cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));

    cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID, cudaMemcpyHostToDevice);

    dim3 block(NUM_OF_GPU_THREADS);
    dim3 grid(BLOCKS_PER_GRID);
    dotProduct<<<grid, block>>>(d_A, d_B, d_C);

    cudaThreadSynchronize();
    checkCUDAError("kernel invocation");

    cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost );
    checkCUDAError("memcpy");

    sum = 0;
    for (i = 0; i < BLOCKS_PER_GRID; i++)
        sum += h_C[i];

    seq_sum = seq_dotProduct(h_A, h_B, n);

    float value = abs(sum - seq_sum);
    if (value > ACCURACY) {
        printf("Test FAILED: %f \n", value);
    }
    else{
        printf("Test PASSED \n");
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

您的内核需要稍微重新编写

“最后一个块”会让任何超过输入向量长度的线程提前退出,但对该块的元素求和的 for 循环没有正确检查以确保它没有超过向量长度。这会导致越界读取访问;当您使用 cuda-memcheck 运行代码时就会看到这种情况(假设您输入的向量大小不是 256 的倍数)。

此外,不应在条件代码中使用
__syncthreads()
,除非该条件在块中的所有线程中计算出相同的值。最后一个块会违反此规则,因为当向量长度不是 256 的倍数时,部分线程会在到达屏障之前提前返回。

除此之外,对于较大的向量大小,您对
float
类型结果的精度期望过高(有效位数太多)。您需要根据相加的浮点值的数量来调整
精度阈值(ACCURACY)

下面是代码的修改版本,其中包括我上面提到的更改:
 #include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h> 

#include "cuda_runtime.h"
#include "device_launch_parameters.h"  

#define MINREAL -1024.0
#define MAXREAL 1024.0

#define ACCURACY 0.01

#define NUM_OF_GPU_THREADS 256 

// Abort the process with a diagnostic if the CUDA runtime has a pending error.
// msg: caller-supplied label identifying which operation is being checked.
void checkCUDAError(const char *msg) {
    const cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
        return;
    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(EXIT_FAILURE);
}

// Fill vec[0..N-1] with uniform random floats in [MINREAL, MAXREAL].
// Uses the C library rand(); seed with srand() beforehand.
void vecFillRand(int N, float *vec) {
    for (int k = 0; k < N; k++) {
        float u = rand() / (float) RAND_MAX;            // uniform in [0, 1]
        vec[k] = u * (MAXREAL - MINREAL) + MINREAL;     // scale into range
    }
}

// Sequential CPU reference: dot product of a and b, each of length n.
// Returns 0 for n == 0.
float seq_dotProduct(float *a, float *b, int n) {
    float acc = 0;
    for (int k = 0; k < n; k++)
        acc += a[k] * b[k];
    return acc;
}

// kernel
// Per-block partial dot product: thread i of each block writes a[idx]*b[idx]
// into shared memory, then thread 0 serially sums the block's 256 products
// into c[blockIdx.x]. The host reduces the per-block partials afterwards.
//
// NOTE(review): this is the question's known-buggy version (see the answer
// and the corrected kernel further below):
//  - the guard `idx >= gridDim.x * blockDim.x` can never be true for a 1-D
//    launch, so it removes nothing; when the vector length is not a multiple
//    of blockDim.x, a[idx]/b[idx] read out of bounds and garbage products
//    are summed into the last block's result;
//  - an early `return` before __syncthreads() would make the barrier
//    divergent if the guard ever did fire.
__global__ void dotProduct(float *a, float *b, float *c) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float t;

    // NOTE(review): always false — idx is bounded by the launched grid.
    if (idx >= gridDim.x * blockDim.x)
        return;

    temp[threadIdx.x] = a[idx] * b[idx];

    __syncthreads();

    if(threadIdx.x == 0) {
        // Thread 0 accumulates all NUM_OF_GPU_THREADS shared-memory slots.
        c[blockIdx.x] = 0.0f;
        for(int i = 0; i < NUM_OF_GPU_THREADS; i++){
            t = temp[i];
            c[blockIdx.x] = c[blockIdx.x] + t;
        }
    }
}

/*
 * Host driver: builds two random vectors of length n, computes their dot
 * product on the GPU (one partial sum per 256-thread block, reduced on the
 * host) and sequentially on the CPU, then compares the two results.
 *
 * n is taken from argv[1] when present, otherwise prompted on stdin.
 */
int main(int argc, char* argv[]) {
    int i, n, ARRAY_BYTES;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    float sum;
    float seq_sum;
    clock_t t;

    srand(time(NULL));

    if (argc == 2) {
        n = atoi(argv[1]);
    } else {
        printf("N? ");
        fflush(stdout);
        if (scanf("%d", &n) != 1) {   // reject non-numeric input
            fprintf(stderr, "invalid input for N\n");
            return EXIT_FAILURE;
        }
    }

    // Integer ceiling division; avoids float rounding that ceil() on a
    // float quotient can introduce for large n.
    int BLOCKS_PER_GRID = (n + NUM_OF_GPU_THREADS - 1) / NUM_OF_GPU_THREADS;

    // arrays on host
    ARRAY_BYTES = n * sizeof(float);
    h_A = (float *) malloc(ARRAY_BYTES);
    h_B = (float *) malloc(ARRAY_BYTES);
    h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
    printf("\ncreating A and B...\n\n");
    vecFillRand(n, h_A);
    vecFillRand(n, h_B);
    vecFillRand(BLOCKS_PER_GRID, h_C);

    // arrays on device
    cudaMalloc((void**) &d_A, ARRAY_BYTES);
    cudaMalloc((void**) &d_B, ARRAY_BYTES);
    cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));

    // transfer the arrays to the GPU
    cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
    // BUG FIX: the original passed BLOCKS_PER_GRID (a count) as the byte
    // size, copying only a fraction of h_C; the factor * sizeof(float)
    // was missing.
    cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyHostToDevice);

    // TIME START
    // create events for timing execution
    cudaEvent_t start = cudaEvent_t();
    cudaEvent_t stop = cudaEvent_t();
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    // record time into start event
    cudaEventRecord( start, 0 ); // 0 is the default stream id

    // launch the kernel: one 256-thread block per 256-element slice
    dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
    dim3 grid(BLOCKS_PER_GRID);
    printf ("computing dotProduct... \n");
    dotProduct<<<grid, block>>>(d_A, d_B, d_C);

    // block until the device has completed
    // (cudaThreadSynchronize() is deprecated in favor of this call)
    cudaDeviceSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel invocation");

    // TIME END
    cudaEventRecord( stop, 0 );
    // synchronize stop event to wait for end of kernel execution on stream 0
    cudaEventSynchronize( stop );
    // compute elapsed time (done by CUDA run-time)
    float elapsed_kernel = 0.f;
    cudaEventElapsedTime( &elapsed_kernel, start, stop );
    // release events
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    // print kernel time (elapsed time is reported in milliseconds)
    printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);

    // copy back the per-block partial sums to the CPU
    cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost );

    // Check for any CUDA errors
    checkCUDAError("memcpy");

    // final reduction of the per-block partials on the host
    sum = 0;
    for (i = 0; i < BLOCKS_PER_GRID; i++)
        sum += h_C[i];

    //  launch sequential reference for comparison
    t = clock();
    printf ("computing seq_dotProduct... \n");
    seq_sum = seq_dotProduct(h_A, h_B, n);
    t = clock() - t;
    printf ("SEQ TIME: %f \n\n",((float)t)/CLOCKS_PER_SEC);

    // check sum and seq_sum
    // BUG FIX: abs() is the integer overload in C; fabsf() keeps the
    // float difference intact.
    float value = fabsf(sum - seq_sum);
    if (value > ACCURACY) {
        printf("Test FAILED: %f \n", value);
    }
    else{
        printf("Test PASSED \n");
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

//#include "cuda_runtime.h"
//#include "device_launch_parameters.h"

#define MINREAL -1024.0
#define MAXREAL 1024.0
#define FAST_RED
#define ACCURACY 0.0001

#define NUM_OF_GPU_THREADS 256

// If the CUDA runtime holds a sticky error, print it (tagged with msg so the
// failing call site can be identified) and terminate the program.
void checkCUDAError(const char *msg) {
    const cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Populate vec[0..N-1] with uniform random floats drawn from
// [MINREAL, MAXREAL]; rand() should be seeded by the caller.
void vecFillRand(int N, float *vec) {
    for (int k = 0; k < N; k++) {
        float u = rand() / (float) RAND_MAX;            // uniform in [0, 1]
        vec[k] = u * (MAXREAL - MINREAL) + MINREAL;     // scale into range
    }
}

// CPU reference implementation: returns the dot product of a and b
// (both of length n); yields 0 when n == 0.
float seq_dotProduct(float *a, float *b, int n) {
    float acc = 0;
    for (int k = 0; k < n; k++)
        acc += a[k] * b[k];
    return acc;
}

// kernel
// Per-block partial dot product: each block reduces its 256-element slice of
// a*b and writes one partial sum to c[blockIdx.x]; the host sums the
// BLOCKS_PER_GRID partials. Expects a 1-D launch with
// blockDim.x == NUM_OF_GPU_THREADS. n is the vector length.
__global__ void dotProduct(float *a, float *b, float *c, int n) {
    __shared__ float temp[NUM_OF_GPU_THREADS];
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    // Out-of-range threads store 0 instead of returning early, so every
    // thread in the block reaches the (non-divergent) barrier below and
    // excess slots contribute nothing to the sum.
    if (idx < n)
      temp[threadIdx.x] = a[idx] * b[idx];
    else temp[threadIdx.x] = 0.0f;

    __syncthreads();
#ifdef FAST_RED
    // Shared-memory tree reduction, halving the active range each pass;
    // assumes block dimension is a power of 2
    for (int i = blockDim.x>>1; i > 0; i >>= 1){
      if (threadIdx.x < i) temp[threadIdx.x] += temp[threadIdx.x+i];
      __syncthreads();}
    if (threadIdx.x == 0) c[blockIdx.x] = temp[0];
#else
    // Serial fallback: thread 0 walks this block's slice, stopping at n so
    // the tail block never reads past the valid products.
    float t;
    if(threadIdx.x == 0) {
        c[blockIdx.x] = 0.0f;
        int j=0;
        for(int i = blockIdx.x*blockDim.x; ((i < ((blockIdx.x+1)*blockDim.x)) && (i < n)); i++){
            t = temp[j++];
            c[blockIdx.x] = c[blockIdx.x] + t;
        }
    }
#endif
}

/*
 * Host driver for the corrected kernel: builds two random vectors of length
 * n, computes their dot product on the GPU (one partial per block, reduced
 * on the host) and on the CPU, then compares the results with a RELATIVE
 * error tolerance (appropriate for float accumulation over many terms).
 *
 * n is taken from argv[1] when present, otherwise prompted on stdin.
 */
int main(int argc, char* argv[]) {
    int i, n, ARRAY_BYTES;
    float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
    float sum;
    float seq_sum;
    clock_t t;

    srand(time(NULL));

    if (argc == 2) {
        n = atoi(argv[1]);
    } else {
        printf("N? ");
        fflush(stdout);
        if (scanf("%d", &n) != 1) {   // reject non-numeric input
            fprintf(stderr, "invalid input for N\n");
            return EXIT_FAILURE;
        }
    }

    // Integer ceiling division; avoids float rounding that ceil() on a
    // float quotient can introduce for large n.
    int BLOCKS_PER_GRID = (n + NUM_OF_GPU_THREADS - 1) / NUM_OF_GPU_THREADS;
    printf("bpg = %d\n", BLOCKS_PER_GRID);

    // arrays on host
    ARRAY_BYTES = n * sizeof(float);
    h_A = (float *) malloc(ARRAY_BYTES);
    h_B = (float *) malloc(ARRAY_BYTES);
    h_C = (float *) malloc(BLOCKS_PER_GRID * sizeof(float));
    printf("\ncreating A and B...\n\n");
    vecFillRand(n, h_A);
    vecFillRand(n, h_B);
    vecFillRand(BLOCKS_PER_GRID, h_C);

    // arrays on device
    cudaMalloc((void**) &d_A, ARRAY_BYTES);
    cudaMalloc((void**) &d_B, ARRAY_BYTES);
    cudaMalloc((void**) &d_C, BLOCKS_PER_GRID * sizeof(float));

    // transfer the arrays to the GPU
    cudaMemcpy(d_A, h_A, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, ARRAY_BYTES, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C, h_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyHostToDevice);

    // TIME START
    // create events for timing execution
    cudaEvent_t start = cudaEvent_t();
    cudaEvent_t stop = cudaEvent_t();
    cudaEventCreate( &start );
    cudaEventCreate( &stop );
    // record time into start event
    cudaEventRecord( start, 0 ); // 0 is the default stream id

    // launch the kernel: one 256-thread block per 256-element slice;
    // the kernel takes n so its tail block can bounds-check.
    dim3 block(NUM_OF_GPU_THREADS); // 256, 1, 1
    dim3 grid(BLOCKS_PER_GRID);
    printf ("computing dotProduct... \n");
    dotProduct<<<grid, block>>>(d_A, d_B, d_C, n);

    // block until the device has completed
    cudaDeviceSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel invocation");

    // TIME END
    cudaEventRecord( stop, 0 );
    // synchronize stop event to wait for end of kernel execution on stream 0
    cudaEventSynchronize( stop );
    // compute elapsed time (done by CUDA run-time)
    float elapsed_kernel = 0.f;
    cudaEventElapsedTime( &elapsed_kernel, start, stop );
    // release events
    cudaEventDestroy( start );
    cudaEventDestroy( stop );
    // print kernel time (elapsed time is reported in milliseconds)
    printf("CUDA TIME: %f \n\n", elapsed_kernel/1000);

    // copy back the per-block partial sums to the CPU
    cudaMemcpy(h_C, d_C, BLOCKS_PER_GRID * sizeof(float), cudaMemcpyDeviceToHost );

    // Check for any CUDA errors
    checkCUDAError("memcpy");

    // final reduction of the per-block partials on the host
    sum = 0;
    for (i = 0; i < BLOCKS_PER_GRID; i++)
        sum += h_C[i];

    //  launch sequential reference for comparison
    t = clock();
    printf ("computing seq_dotProduct... \n");
    seq_sum = seq_dotProduct(h_A, h_B, n);
    t = clock() - t;
    printf ("SEQ TIME: %f \n\n",((float)t)/CLOCKS_PER_SEC);

    // check relative error between sum and seq_sum
    // BUG FIX: abs() is the integer overload in C; fabsf() keeps the
    // float quotient intact.
    float value = fabsf((sum - seq_sum)/sum);
    if (value > ACCURACY) {
        printf("Test FAILED: err: %f cpu: %f  gpu: %f \n", value, seq_sum, sum);
    }
    else{
        printf("Test PASSED \n");
    }

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}