Cuda 求和扫描-如果线程数过多,则结果错误

Cuda 求和扫描-如果线程数过多,则结果错误,cuda,Cuda,受上述和扫描算法实现的启发,我尝试以下方式实现它: #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> __global__ void count_zeros_shared(int N, int M, int* data) { __shared__ unsigned s_offset[1024]; for (int s = threadIdx.x +

受上述和扫描算法实现的启发,我尝试以下方式实现它:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];

    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

        if ((s - threadIdx.x) == 0)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
            }
        }
        __syncthreads();
        if ((s - threadIdx.x) == 0)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

int main(int argc, char* argv[])
{
    int NTH = 1024; // FAULTY case by default to answer request within comments
    if (argc > 1) NTH = ::atoi(argv[1]);

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    ::srand(42);

    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ; 
        else values[k] = 0;
    }

    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
在GeForce 1060M上执行,使用CUDA 9.0编译,在Windows 10上调试时以cc 6.1为目标

\uuu syncthreads()
例程在适当的位置被调用,但是,自CUDA 9.0以来,其行为可能已经改变。我是不是遗漏了什么

补充信息 linux上的编译:

$> nvcc -G main.cu -arch sm_61
在GeForce GTX 1080上运行的结果与此类似

但是,

$> nvcc main.cu -arch sm_61

结果看起来不错。

在这一点上存在竞争条件:

    for (int shift = 1; shift < blockDim.x; shift += shift)
    {
        __syncthreads();
        if (threadIdx.x >= shift)
        {
            s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
            ^^^^^^                   ^^^^^^
            write op                 read op
我确信这不是“最佳的”,但整个练习不是“最佳的”。如果需要快速扫描,请使用或

回答评论中的一个问题:我同意使用或不使用
-G
(设备调试代码生成)开关编译似乎会影响
cuda memcheck--tool racecheck…
是否报告危险。在linux CUDA 9.0、GTX960(cc5.2)上,我使用原始提供代码的一个次要变体编写了以下测试用例:

$ cat t271.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];
#ifdef FIX
    __shared__ unsigned s_offset2[1024];
#endif
    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
#ifndef FIX
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];  //line 34
            }
        }
        __syncthreads();
#else
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            s_offset2[threadIdx.x] = s_offset[threadIdx.x];
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset2[threadIdx.x - shift];
            }
            __syncthreads();
        }
#endif
//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

int main(int argc, char* argv[])
{
    int NTH = 128;
    if (argc > 1) NTH = ::atoi(argv[1]);

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    ::srand(42);

    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    printf("count = %d\n", count);
    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
$ nvcc -G -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$ nvcc -lineinfo -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [236072 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1992 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [2369 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [232728 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [913 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [233479 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1841 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [239007 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1833 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [228636 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1689 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [225456 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [2177 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [151696 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1009 hazards]
=========
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [8064 hazards]
=========
========= RACECHECK SUMMARY: 16 hazards displayed (16 errors, 0 warnings)
$
$cat t271.cu
#包括“cuda_runtime.h”
#包括“设备启动参数.h”
#包括
__全局\uuuuu无效计数\u零\u共享(int N,int M,int*数据)
{
__共享的无符号s_偏移量[1024];
#ifdef修复
__共享的未签名的s_偏移量2[1024];
#恩迪夫
对于(int s=threadIdx.x+blockIdx.x*blockDim.x;s=shift)
{
s_偏移量[threadIdx.x]+=s_偏移量[threadIdx.x-shift];//第34行
}
}
__同步线程();
#否则
对于(int-shift=1;shift=shift)
{
s_偏移量[threadIdx.x]+=s_偏移量2[threadIdx.x-shift];
}
__同步线程();
}
#恩迪夫
//如果((s-threadIdx.x)==0)
如果(s==1023)
printf(“s_offset_acc[%d]=%d\n”,threadIdx.x,s_offset[threadIdx.x]);
}
}
#包括
int main(int argc,char*argv[])
{
int n=128;
如果(argc>1)n=::atoi(argv[1]);
cudaError_t cuerr;
int*值;
int N=1024*48,M=448;
cuerr=::cudaMallocManaged(&value,N*M*sizeof(int));
如果(cuerr!=cudaSuccess)返回cuerr;
整数计数=0;
::srand(42);
对于(int k=0;k(N,M,值);
cuerr=::cudaDeviceSynchronize();
如果(cuerr!=cudaSuccess)返回cuerr;
return::cudaDeviceReset();
}
$nvcc-G-arch=sm_52-o t271.cu
$cuda memcheck——工具线检查。/t271 1024
==========CUDA-MEMCHECK
计数=3145571
s_偏移量[1023]=73
s_offset_acc[1023]=65571
=========比赛检查摘要:显示0个危险(0个错误,0个警告)
$nvcc-lineinfo-arch=sm_52-o t271.cu
$cuda memcheck——工具线检查。/t271 1024
==========CUDA-MEMCHECK
计数=3145571
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=========在0x00000bc8 in/home/bob/misc/t271处进行和写访问。cu:34:计数零共享(int,int,int*)[236072]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
==========在0x00000bc8 in/home/bob/misc/t271处进行写访问。cu:34:count_zero_shared(int,int,int*)[1992]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=========在0x00000bc8 in/home/bob/misc/t271处进行和写访问。cu:34:计数零共享(int,int,int*)[2369]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=========在0x00000bc8 in/home/bob/misc/t271处进行写入访问。cu:34:计数零共享(int,int,int*)[232728]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=========在0x00000bc8 in/home/bob/misc/t271处进行写入访问。cu:34:计数零共享(int,int,int*)[913]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=========在0x00000bc8 in/home/bob/misc/t271处进行和写访问。cu:34:计数零共享(int,int,int*)[233479]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=======在0x00000bc8 in/home/bob/misc/t271处进行写入访问。cu:34:count_zero_shared(int,int,int*)[1841]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=========在0x00000bc8 in/home/bob/misc/t271处进行写入访问。cu:34:计数零共享(int,int,int*)[239007]
=========
======错误:在/home/bob/misc/t271中0x00000bb0处的读取访问之间报告竞争。cu:34:计数零共享(int,int,int*)
=========在0x00000bc8 in/home/bob/misc/t271处进行和写访问。cu:34:count_
$ cat t271.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];
    __shared__ unsigned s_offset2[1024];

    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            s_offset2[threadIdx.x] = s_offset[threadIdx.x];
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset2[threadIdx.x - shift];
            }
            __syncthreads();
        }
//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

int main(int argc, char* argv[])
{
    int NTH = 128;
    if (argc > 1) NTH = ::atoi(argv[1]);

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    ::srand(42);

    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    printf("count = %d\n", count);
    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
$ nvcc -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$
$ cat t271.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void count_zeros_shared(int N, int M, int* data)
{
    __shared__ unsigned s_offset[1024];
#ifdef FIX
    __shared__ unsigned s_offset2[1024];
#endif
    for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
    {
        // count non-zero values
        unsigned count = 0;
        for (int j = 0; j < M; ++j)
        {
            if (data[s + N * j] != 0)
                ++count;
        }
        s_offset[threadIdx.x] = count;

//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);

        // reduce offset
#ifndef FIX
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];  //line 34
            }
        }
        __syncthreads();
#else
        for (int shift = 1; shift < blockDim.x; shift += shift)
        {
            s_offset2[threadIdx.x] = s_offset[threadIdx.x];
            __syncthreads();
            if (threadIdx.x >= shift)
            {
                s_offset[threadIdx.x] += s_offset2[threadIdx.x - shift];
            }
            __syncthreads();
        }
#endif
//        if ((s - threadIdx.x) == 0)
        if (s == 1023)
            printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
    }
}

#include <cstdlib>

int main(int argc, char* argv[])
{
    int NTH = 128;
    if (argc > 1) NTH = ::atoi(argv[1]);

    cudaError_t cuerr;

    int* values;
    int N = 1024 * 48, M = 448;

    cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
    if (cuerr != cudaSuccess) return cuerr;

    int count = 0;

    ::srand(42);

    for (int k = 0; k < N*M; ++k)
    {
        if ((rand() % 7) == 0) values[k] = ++count ;
        else values[k] = 0;
    }

    printf("count = %d\n", count);
    count_zeros_shared <<< N / NTH, NTH >>> (N, M, values);
    cuerr = ::cudaDeviceSynchronize();
    if (cuerr != cudaSuccess) return cuerr;

    return ::cudaDeviceReset();
}
$ nvcc -G -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$ nvcc -lineinfo -arch=sm_52 -o t271 t271.cu
$ cuda-memcheck --tool racecheck ./t271 1024
========= CUDA-MEMCHECK
count = 3145571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [236072 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1992 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [2369 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [232728 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [913 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [233479 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1841 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [239007 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1833 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [228636 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1689 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [225456 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [2177 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [151696 hazards]
=========
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [1009 hazards]
=========
s_offset [1023] = 73
s_offset_acc [1023] = 65571
========= ERROR: Race reported between Read access at 0x00000bb0 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*)
=========     and Write access at 0x00000bc8 in /home/bob/misc/t271.cu:34:count_zeros_shared(int, int, int*) [8064 hazards]
=========
========= RACECHECK SUMMARY: 16 hazards displayed (16 errors, 0 warnings)
$
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

__global__ void init_kernel(unsigned* totalcount)
{
        totalcount [threadIdx.x] = 0;
}

#define TRACE 0

__global__ void count_zeros_shared(int N, int M, int* data, unsigned* totalcount)
{
        __shared__ unsigned s_offset[1024];

        for (int s = threadIdx.x + blockIdx.x * blockDim.x; s < N; s += blockDim.x * gridDim.x)
        {
                // count non-zero values
                unsigned count = 0;
                for (int j = 0; j < M; ++j)
                {
                        if (data[s + N * j] != 0)
                                ++count;
                }
                s_offset[threadIdx.x] = count;

                #if TRACE
                if ((s - threadIdx.x) == 0)
                        printf("s_offset [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
                #endif

                // reduce offset
                for (int shift = 1; shift < blockDim.x; shift += shift)
                {
                        __syncthreads();

                        #if 0 // race condition version

                        if (threadIdx.x >= shift)
                        {
                                s_offset[threadIdx.x] += s_offset[threadIdx.x - shift];
                        }

                        #else

                        int val = s_offset[threadIdx.x];
                        if (threadIdx.x >= shift)
                        {
                                val += s_offset[threadIdx.x - shift] ;
                        }
                        __syncthreads();
                        if (threadIdx.x >= shift)
                        {
                                s_offset[threadIdx.x] = val ;
                        }

                        #endif
                }
                __syncthreads();

                #if TRACE
                if ((s - threadIdx.x) == 0)
                        printf("s_offset_acc [%d] = %d\n", threadIdx.x, s_offset[threadIdx.x]);
                #endif

                if (threadIdx.x == 0)
                        atomicAdd(totalcount, s_offset[blockDim.x - 1]);

                __syncthreads();
        }

}

__global__ void printsum_kernel(int N, unsigned* totalcount)
{
        for (int NTH = 32 ; NTH <= 1024 ; NTH *= 2)
                printf("GPU TOTAL COUNT [BlockDIM = %d] = %d\n", NTH, totalcount[(NTH / 32) - 1]);
}


#include <cstdlib>

int main(int argc, char* argv[])
{
        cudaError_t cuerr;

        int* values;
        unsigned* totalcount;
        int N = 1024 * 48, M = 448;

        cuerr = ::cudaMalloc(&totalcount, (1024/32) * sizeof(unsigned)) ;
        if (cuerr != cudaSuccess) return cuerr;

        cuerr = ::cudaMallocManaged(&values, N*M * sizeof(int)) ;
        if (cuerr != cudaSuccess) return cuerr;

        int count = 0;

        ::srand(42);

        for (int k = 0; k < N*M; ++k)
        {
                if ((rand() % 7) == 0) values[k] = ++count ;
                else values[k] = 0;
        }

        init_kernel << < 1, 1024/32 >> > (totalcount);
        cuerr = ::cudaDeviceSynchronize();
        if (cuerr != cudaSuccess) return cuerr;

        for (int NTH = 32; NTH <= 1024; NTH *= 2)
        {
                printf("RUNNING %d threads per block\n", NTH);

                count_zeros_shared << < N / NTH, NTH >> > (N, M, values, totalcount + ((NTH / 32) - 1));
                cuerr = ::cudaDeviceSynchronize();
                if (cuerr != cudaSuccess) return cuerr;
        }

        printsum_kernel << < 1, 1 >> > (1024/32, totalcount);
        cuerr = ::cudaDeviceSynchronize();
        if (cuerr != cudaSuccess) return cuerr;

        printf("GROUND TRUTH TOTAL COUNT = %d\n", count);

        return ::cudaDeviceReset();
}
$> nvcc -G main3.cu -arch sm_61 -o a3.out
$>  cuda-memcheck --version
CUDA-MEMCHECK version 9.0.176 ID:(44)
$> cuda-memcheck --tool racecheck ./a3.out
========= CUDA-MEMCHECK
RUNNING 32 threads per block
RUNNING 64 threads per block
RUNNING 128 threads per block
RUNNING 256 threads per block
RUNNING 512 threads per block
RUNNING 1024 threads per block
GPU TOTAL COUNT [BlockDIM = 32] = 3145571
GPU TOTAL COUNT [BlockDIM = 64] = 3145571
GPU TOTAL COUNT [BlockDIM = 128] = 3145571
GPU TOTAL COUNT [BlockDIM = 256] = 3145571
GPU TOTAL COUNT [BlockDIM = 512] = 3145571
GPU TOTAL COUNT [BlockDIM = 1024] = 3145571
GROUND TRUTH TOTAL COUNT = 3145571
========= RACECHECK SUMMARY: 0 hazards displayed (0 errors, 0 warnings)
$> ./a3.out
RUNNING 32 threads per block
RUNNING 64 threads per block
RUNNING 128 threads per block
RUNNING 256 threads per block
RUNNING 512 threads per block
RUNNING 1024 threads per block
GPU TOTAL COUNT [BlockDIM = 32] = 3145571
GPU TOTAL COUNT [BlockDIM = 64] = 3145571
GPU TOTAL COUNT [BlockDIM = 128] = 3161857
GPU TOTAL COUNT [BlockDIM = 256] = 3200816
GPU TOTAL COUNT [BlockDIM = 512] = 3231303
GPU TOTAL COUNT [BlockDIM = 1024] = 3925122
GROUND TRUTH TOTAL COUNT = 3145571

$> nvidia-smi
Fri Jan  5 18:29:33 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.98                 Driver Version: 384.98                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce GTX 1080    Off  | 00000000:02:00.0 Off |                  N/A |
| 29%   44C    P0    39W / 180W |      0MiB /  8112MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+