改进的reduce0 CUDA还原代码不工作

改进的reduce0 CUDA还原代码不工作,cuda,gpu,Cuda,Gpu,我写了一个CUDA代码,基本上为我求一个数组的和。数组大小N应该是2的幂,即2^x。但是,我的代码工作不正常。例如,如果输出为150177410,则我的代码输出150177408。在过去的5个小时里,我一直在尝试调试它。任何帮助都将不胜感激。代码如下: //only for array size of 2^x and TPB of 2^y as godata is = num of blocks. But num of blocks 2^sth if previous satisfied //W

我写了一个CUDA代码,基本上为我求一个数组的和。数组大小
N
应该是
2
的幂,即
2^x
。但是,我的代码工作不正常。例如,如果输出为
150177410
,则我的代码输出
150177408
。在过去的
5个小时里,我一直在尝试调试它。任何帮助都将不胜感激。代码如下:

//only for array size of 2^x and TPB of 2^y as godata is = num of blocks. But num of blocks 2^sth if previous satisfied
//Works for arbitrary size array of type 2^x


#include<stdio.h>

__global__ void computeAddShared(int *in , int *out, int sizeInput){
    // Sums 'in' (length sizeInput) into one partial sum per block, written to
    // out[blockIdx.x]. Requires blockDim.x to be a power of two and
    // blockDim.x * sizeof(int) bytes of dynamic shared memory per block.
    //
    // BUG FIX: the shared buffer was declared 'float' while the data is 'int'.
    // A float carries only 24 mantissa bits, so integer sums above 2^24
    // (e.g. 150177410) are silently rounded (to 150177408) — exactly the
    // symptom reported. Declaring the buffer 'int' keeps the sum exact.
    extern __shared__ int temp[];

    // Parameters are not named gidata/godata to emphasize that the kernel
    // receives copies of the addresses, distinct from the host-code pointers.
    int tid = blockIdx.x * blockDim.x + threadIdx.x;   // global thread index
    int ltid = threadIdx.x;                            // index within the block

    // Grid-stride accumulation: each thread folds every
    // (gridDim.x * blockDim.x)-th element into its shared slot, so an input
    // of any size is handled regardless of launch configuration.
    temp[ltid] = 0;
    while(tid < sizeInput){
        temp[ltid] += in[tid];
        tid += gridDim.x * blockDim.x;
    }
    __syncthreads();   // all shared-memory stores must finish before reducing

    // Interleaved tree reduction in shared memory (SDK reduce0 style):
    // stride doubles each pass; only threads at multiples of 2*offset add.
    int offset = 1;
    while(offset < blockDim.x){
        if(ltid % (offset * 2) == 0){
            temp[ltid] = temp[ltid] + temp[ltid + offset];
        }
        __syncthreads();   // barrier kept outside the divergent branch on purpose
        offset *= 2;
    }

    // Thread 0 publishes this block's partial sum.
    if(ltid == 0){
        out[blockIdx.x] = temp[0];
    }
}

int main(){

    // Host driver: reads N comma-separated integers from invertedList.txt,
    // computes a host-side reference sum, then repeatedly launches
    // computeAddShared, shrinking the array of per-block partial sums until a
    // single value remains. N (and hence every intermediate size) must be a
    // power of two for the kernel's tree reduction.

    int N = 8192;//should be 2^sth
    int size = N;
    int *a = (int*)malloc(N * sizeof(int));
    /* To create the random-number input file:
    FILE *f;
        f = fopen("invertedList.txt" , "w");
        a[0] = 1 + (rand() % 8);
        fprintf(f, "%d,",a[0]);
        for( int i = 1 ; i< N; i++){
            a[i] = a[i-1] + (rand() % 8) + 1;
            fprintf(f, "%d,",a[i]);
        }
        fclose(f);
        return 0;*/
    FILE *f;
    f = fopen("invertedList.txt","r");
    if( f == NULL){
            printf("File not found\n");
            system("pause");
            exit(1);
    }

    // --- Read the input and accumulate the reference sum on the host.
    int count = 0 ;
    long actualSum = 0;
    for( int i =0 ; i < N ; i++){
        fscanf(f, "%d,", &a[count]);
        actualSum+=a[count];
        count++;
    }
    fclose(f);
    // FIX: actualSum is 'long', so the format specifier must be %ld (was %d,
    // which is undefined behavior and can print garbage on LP64 platforms).
    printf("The actual sum is %ld\n",actualSum);

    int* gidata;
    int* godata;
    cudaMalloc((void**)&gidata, N* sizeof(int));
    cudaMemcpy(gidata,a, size * sizeof(int), cudaMemcpyHostToDevice);
    int TPB  = 256;
    int blocks = 10; // dummy non-1 value so the loop runs at least once
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // --- Each pass reduces 'size' values into 'blocks' per-block partial
    //     sums; iterate until a single value is left.
    while(blocks != 1 ){
        if(size < TPB){
            TPB  = size; // size is a power of two, so TPB remains one as well
        }
        blocks  = (size+ TPB -1 ) / TPB;
        cudaMalloc((void**)&godata, blocks * sizeof(int));
        computeAddShared<<<blocks, TPB,TPB*sizeof(int)>>>(gidata, godata,size);
        // FIX: free the consumed input buffer each pass (cudaFree synchronizes
        // with the pending kernel), plugging the original per-pass leak of
        // every intermediate buffer plus the initial input allocation.
        cudaFree(gidata);
        gidata = godata;
        size = blocks;
    }

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime; 
    cudaEventElapsedTime(&elapsedTime , start, stop);
    printf("time is %f ms", elapsedTime);

    // --- Copy back the single remaining value (size == 1 at this point).
    int *output = (int*)malloc(sizeof(int));
    cudaMemcpy(output, gidata,size *  sizeof(int), cudaMemcpyDeviceToHost);

    printf("The error by threadsyn is %s", cudaGetErrorString(cudaGetLastError()));
    printf("The sum of the array is %d\n", output[0]);

    // --- Cleanup. gidata and godata alias the same final buffer: free once.
    cudaError_t chk = cudaFree(godata);
    if(chk!=cudaSuccess){
        printf("cudaFree failed: %s\n", cudaGetErrorString(chk));
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(a);
    free(output);
    getchar();

    return 0;
}
//仅适用于2^x的数组大小和2^y的TPB,因为godata=块数。但如果先前满足,则块数2^sth
//适用于类型为2^x的任意大小数组
#include<stdio.h>
__全局无效计算共享(int*in、int*out、int-sizeInput){
//未创建参数gidata和godata,以强调参数获取地址副本,并且不同于主机代码中的指针
外部共享浮动温度[];
int tid=blockIdx.x*blockDim.x+threadIdx.x;
int ltid=threadIdx.x;
温度[ltid]=0;
而(tid
正如Talonmes之前所说,内核本身是可以的。基本上,它是CUDA SDK的
reduce0
内核,在将同一SDK的
reduce5
内核改进为
reduce6
时使用的Brent优化意义上的算法级联

下面的测试代码可以显示内核是否正常工作,它还比较了
reduce0
与代码中名为
reduce0_stackoverflow
的OP内核的性能。
reduce0_stackoverflow
内核还报告并注释了
reduce0
的相应代码行

对于下面的测试用例,在 GeForce GT540M 卡上,reduce0_stackoverflow 在 0.030ms 内执行完毕,而 reduce0 需要 0.049ms。

请注意,下面的代码并不要求数组大小必须是
2的幂

#include <thrust\device_vector.h>

#define BLOCKSIZE 256

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints the CUDA error string with its source location; optionally waits for
// a keypress and exits. FIX: 'file' is now const char* — __FILE__ expands to
// a string literal, which may not bind to a non-const char* in C++11+.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess) 
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { getchar(); exit(code); }
    }
}

/*******************************************************/
/* CALCULATING THE NEXT POWER OF 2 OF A CERTAIN NUMBER */
/*******************************************************/
// Rounds x up to the nearest power of two (x itself when it already is one).
// Classic bit-smearing trick: after the shift cascade, every bit below the
// highest set bit of (x - 1) is set, so adding one carries into a lone bit.
unsigned int nextPow2(unsigned int x)
{
    unsigned int v = x - 1;
    for (unsigned int shift = 1; shift <= 16; shift <<= 1)
        v |= v >> shift;
    return v + 1;
}

/*************************************/
/* CHECK IF A NUMBER IS A POWER OF 2 */
/*************************************/
// True iff x is a power of two. A power of two has exactly one bit set, so
// x & (x-1) clears it to zero. FIX: the explicit x != 0 guard corrects the
// original, which wrongly reported 0 as a power of two.
bool isPow2(unsigned int x)
{
    return (x != 0) && ((x&(x-1))==0);
}

/******************/
/* REDUCE0 KERNEL */
/******************/
/* Interleaved-addressing reduction (CUDA SDK reduce0). Active threads are
   selected with the modulo operator, which is expensive on GPUs, and the
   interleaved inactivity leaves no warp fully active — deliberately the slow
   baseline variant. Requires blockDim.x * sizeof(T) bytes of dynamic shared
   memory and a power-of-two blockDim.x. */
template <class T>
__global__ void reduce0(T *g_idata, T *g_odata, unsigned int N)
{
    extern __shared__ T sdata[];

    unsigned int tid = threadIdx.x;                           // index within the block
    unsigned int i   = blockIdx.x * blockDim.x + threadIdx.x; // index within the grid

    // --- Stage one element per thread into shared memory (0 past the end).
    sdata[tid] = (i < N) ? g_idata[i] : 0;

    // --- Every shared-memory store must complete before the tree reduction.
    __syncthreads();

    // --- Pairwise tree reduction: the stride doubles each pass and only
    //     threads whose index is a multiple of 2*stride accumulate.
    unsigned int stride = 1;
    while (stride < blockDim.x)
    {
        if (tid % (2 * stride) == 0)
            sdata[tid] += sdata[tid + stride];
        // --- Barrier sits outside the divergent branch so all threads reach it.
        __syncthreads();
        stride <<= 1;
    }

    // --- Thread 0 writes this block's partial sum to global memory; after the
    //     kernel, g_odata holds one partial sum per block.
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}

/***************************/
/* REDUCE0 - STACKOVERFLOW */
/***************************/
// The OP's kernel, restated line-for-line next to reduce0 for comparison.
// It differs from reduce0 only in the load phase: a grid-stride while loop
// lets each thread accumulate several input elements, so the grid may be
// smaller than N (algorithm cascading in the Brent-optimization sense).
// Requires blockDim.x * sizeof(T) bytes of dynamic shared memory and a
// power-of-two blockDim.x; the commented-out lines show the corresponding
// CUDA SDK reduce0 code.
template <class T>
__global__ void reduce0_stackoverflow(T *g_idata , T *g_odata, unsigned int N) {

    extern __shared__ T sdata[];

    unsigned int tid    = threadIdx.x;                              // Local thread index
    unsigned int i      = blockIdx.x * blockDim.x + threadIdx.x;    // Global thread index

    // --- Loading data to shared memory
    //sdata[tid] = (i < N) ? g_idata[i] : 0;                        // CUDA SDK
    sdata[tid] = 0;
    while(i < N){
        sdata[tid] += g_idata[i];
        i+=gridDim.x * blockDim.x; // to handle array of any size
    }

    // --- Before going further, we have to make sure that all the shared memory loads have been completed
    __syncthreads();

    // --- Reduction in shared memory
    //  for (unsigned int s=1; s < blockDim.x; s *= 2)                  // CUDA SDK
    //  {
    //     if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }
    //     __syncthreads();
    //  }
    unsigned int s = 1;
    while(s < blockDim.x) 
    {
        // --- Only the threads with index multiple of 2*s perform additions. Furthermore, modulo arithmetic is slow.       
        if ((tid % (2*s)) == 0) { sdata[tid] += sdata[tid + s]; }

        // --- At the end of each iteration loop, we have to make sure that all memory operations have been completed
        __syncthreads();

        s*=2;
    }

    // --- Write result for this block to global memory. At the end of the kernel, global memory will contain the results for the summations of
    //     individual blocks
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];

}

/********/
/* MAIN */
/********/
// Test driver: fills a device vector with N copies of 3, runs reduce0 and
// reduce0_stackoverflow (the latter on half the blocks, exploiting its
// grid-stride load), finishes each reduction on the host, and prints timings
// and sums. Both results should equal 3 * N.
int main()
{
    const int N = 15336;

    thrust::device_vector<int> d_vec(N,3);

    int NumThreads  = (N < BLOCKSIZE) ? nextPow2(N) : BLOCKSIZE;
    int NumBlocks   = (N + NumThreads - 1) / NumThreads;

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize = (NumThreads <= 32) ? 2 * NumThreads * sizeof(int) : NumThreads * sizeof(int);

    // --- Creating events for timing
    float time;
    cudaEvent_t start, stop;
    gpuErrchk(cudaEventCreate(&start));
    gpuErrchk(cudaEventCreate(&stop));

    thrust::device_vector<int> d_vec_block(NumBlocks);

    /***********/
    /* REDUCE0 */
    /***********/
    cudaEventRecord(start, 0);
    reduce0<<<NumBlocks, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N);
    gpuErrchk(cudaPeekAtLastError());   // FIX: surface launch-configuration errors
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("reduce0 - Elapsed time:  %3.3f ms \n", time);   

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host
    thrust::host_vector<int> h_vec_block(d_vec_block);
    int sum_reduce0 = 0;
    for (int i=0; i<NumBlocks; i++) sum_reduce0 = sum_reduce0 + h_vec_block[i];
    printf("Result for reduce0 = %i\n",sum_reduce0);

    /**********************************/
    /* REDUCE0 KERNEL - STACKOVERFLOW */
    /**********************************/
    // Half the blocks: the grid-stride load makes each thread sum ~2 elements.
    int NumBlocksHalf = NumBlocks / 2;
    cudaEventRecord(start, 0);
    reduce0_stackoverflow<<<NumBlocksHalf, NumThreads, smemSize>>>(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(d_vec_block.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("reduce0 - stackoverflow - Elapsed time:  %3.3f ms \n", time);   

    // --- The last part of the reduction, which would be expensive to perform on the device, is executed on the host.
    //     FIX: only NumBlocksHalf blocks were launched, so only the first
    //     NumBlocksHalf entries of d_vec_block are valid; the original summed
    //     all NumBlocks entries and double-counted stale reduce0 partials.
    h_vec_block = d_vec_block;
    int sum_reduce0_stackoverflow = 0;
    for (int i=0; i<NumBlocksHalf; i++) sum_reduce0_stackoverflow = sum_reduce0_stackoverflow + h_vec_block[i];
    printf("Result for reduce0_stackoverflow = %i\n",sum_reduce0_stackoverflow);

    gpuErrchk(cudaEventDestroy(start));
    gpuErrchk(cudaEventDestroy(stop));

    getchar();
}
#include <thrust/device_vector.h>
#define BLOCKSIZE 256
/********************/
/*CUDA错误检查*/
/********************/
#定义gpuerchk(ans){gpuAssert((ans),_文件_,_行__)}
内联void gpuAssert(cudaError\u t代码,char*文件,int行,bool abort=true)
{
如果(代码!=cudaSuccess)
{
fprintf(标准,“GPUassert:%s%s%d\n”,cudaGetErrorString(代码)、文件、行);
if(abort){getchar();exit(code);}
}
}
/*******************************************************/
/*计算某个数的2的次幂*/
/*******************************************************/
无符号整数nextPow2(无符号整数x)
{
--x;
x |=x>>1;
x |=x>>2;
x |=x>>4;
x |=x>>8;
x |=x>>16;
return++x;
}
/*************************************/
/*检查数字是否为2的幂*/
/*************************************/
bool isPow2(无符号整数x)
{
返回((x&(x-1))==0);
}
/******************/
/*REDUCE0核*/
/******************/
/*这种减少通过使用模来交错哪些线程是活动的
操作人员这个操作符在GPU上非常昂贵,并且是交错的
不活动意味着没有整个扭曲处于活动状态,这也是非常重要的
低效*/
模板
__全局无效归约0(T*g\U idata,T*g\U odata,无符号整数N)
{
外部共享数据[];
unsigned int tid=threadIdx.x;