CUDA dot product of two vectors not working for N >= 369


I implemented a vector dot product as shown below. It is compiled with CUDA 7.5 using compute_20, sm_20 and const int THREADS_PER_BLOCK = 16.

The same thing happens for both floats and doubles.

int divUp(int total, int grain) { return (total+grain-1)/grain; }

__device__ __forceinline__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
    }
    while(assumed!=old);
    return __longlong_as_double(old);
}

__device__ __forceinline__ float atomicAdd(float* address, float val)
{
    unsigned int *ptr = (unsigned int *)address;
    unsigned int old, newint, ret = *ptr;
    do {
        old = ret;
        newint = __float_as_int(__int_as_float(old)+val);
    } while((ret = atomicCAS(ptr, old, newint)) != old);

    return __int_as_float(ret);
}

template<typename T>
__global__ void vecdotk(const T* a, const T* b, const int n, T* c)
{
    __shared__ T temp[THREADS_PER_BLOCK];
    int x = threadIdx.x+blockIdx.x*blockDim.x;
    if(x==0) c[0] = 0.0;
    if(x<n) {temp[threadIdx.x] = a[x]*b[x];
    }
    else temp[threadIdx.x] = 0.0;
    __syncthreads();

    if(0==threadIdx.x)
    {
        T sum = 0.0;
        for(int j=0; j<THREADS_PER_BLOCK; ++j)
        {
            sum += temp[j];
        }
        atomicAdd(c, sum);
    }
}

template<typename T>
void dot(const T* a, const T* b, const int n, T* c)
{
    dim3 block(THREADS_PER_BLOCK);
    dim3 grid(divUp(n, block.x), 1);
    vecdotk<T><<<grid, block>>>(a, b, n, c);
    cudaSafeCall(cudaGetLastError());
};
It works up to n = 368, but beyond that the results are incorrect. I would like to know whether the problem is in my implementation or in the values I use (see the second code, the initialization); for example, maybe the additions beyond n = 368 introduce floating point errors (which would be strange, since the same error occurs for both floats and doubles).

This won't work:

if(x==0) c[0] = 0.0; 
There is no guarantee (in CUDA) that thread 0 runs first, or that this line will run before other threads reach any point in the code. You need to initialize c[0] before launching this kernel. Otherwise, some threads may perform their atomic add to c, and then, later, thread 0 may initialize c[0] to zero.

Furthermore, CUDA already provides a float version of atomicAdd, so there is no reason to provide your own. Also, running threadblocks of 16 threads will not give you good performance (I would suggest simply using the CUBLAS dot product function). With the fix for c[0] (remove that line of code, and initialize c[0] before the kernel), your code runs correctly for me:

$ cat t372.cu
#include <stdio.h>

const int n = 2048;
#ifdef USE_DOUBLE
typedef double mytype;
#else
typedef float mytype;
#endif
const int THREADS_PER_BLOCK = 16;

int divUp(int total, int grain) { return (total+grain-1)/grain; }
#if 0
__device__ __forceinline__ double atomicAdd(double* address, double val)
{
    unsigned long long int* address_as_ull = (unsigned long long int*)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
    {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val+__longlong_as_double(assumed)));
    }
    while(assumed!=old);
    return __longlong_as_double(old);
}

__device__ __forceinline__ float atomicAdd(float* address, float val)
{
    unsigned int *ptr = (unsigned int *)address;
    unsigned int old, newint, ret = *ptr;
    do {
        old = ret;
        newint = __float_as_int(__int_as_float(old)+val);
    } while((ret = atomicCAS(ptr, old, newint)) != old);

    return __int_as_float(ret);
}
#endif
template<typename T>
__global__ void vecdotk(const T* a, const T* b, const int n, T* c)
{
    __shared__ T temp[THREADS_PER_BLOCK];
    int x = threadIdx.x+blockIdx.x*blockDim.x;
    //if(x==0) c[0] = 0.0;
    if(x<n) {temp[threadIdx.x] = a[x]*b[x];
    }
    else temp[threadIdx.x] = 0.0;
    __syncthreads();

    if(0==threadIdx.x)
    {
        T sum = 0.0;
        for(int j=0; j<THREADS_PER_BLOCK; ++j)
        {
            sum += temp[j];
        }
        atomicAdd(c, sum);
    }
}

template<typename T>
cudaError_t dot(const T* a, const T* b, const int n, T* c)
{
    dim3 block(THREADS_PER_BLOCK);
    dim3 grid(divUp(n, block.x), 1);
    vecdotk<T><<<grid, block>>>(a, b, n, c);
    cudaDeviceSynchronize();
    return cudaGetLastError();
};

int main(){

  mytype *h_vec_a, *h_vec_b, *d_vec_a, *d_vec_b, *h_c, *d_c;
  int bs = n*sizeof(mytype);
  h_vec_a = (mytype *)malloc(bs);
  h_vec_b = (mytype *)malloc(bs);
  h_c = (mytype *)malloc(sizeof(mytype));
  cudaMalloc(&d_vec_b, bs);
  cudaMalloc(&d_vec_a, bs);
  cudaMalloc(&d_c, sizeof(mytype));
// fill host vectors a and b
  for(int i=0; i<n; ++i)
  {
    h_vec_a[i] = i+1;//__mat_rand();
    h_vec_b[i] = i+1;//__mat_rand();
  }
  h_c[0] = 0;
  cudaMemcpy(d_vec_a, h_vec_a, bs, cudaMemcpyHostToDevice);
  cudaMemcpy(d_vec_b, h_vec_b, bs, cudaMemcpyHostToDevice);
  cudaMemcpy(d_c, h_c, sizeof(mytype), cudaMemcpyHostToDevice);
  dot(d_vec_a, d_vec_b, n, d_c);
  cudaMemcpy(h_c, d_c, sizeof(mytype), cudaMemcpyDeviceToHost);
  mytype test_val = 0;
  for (int i=0; i < n; i++)
    test_val += h_vec_a[i] * h_vec_b[i];
  printf("GPU result: %f, CPU result: %f\n", h_c[0], test_val);

}
$ nvcc -arch=sm_20 -o t372 t372.cu
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ cuda-memcheck ./t372
========= CUDA-MEMCHECK
GPU result: 2865411584.000000, CPU result: 2865411072.000000
========= ERROR SUMMARY: 0 errors
$
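A small aside on initializing c[0] before the kernel: instead of copying a host zero with cudaMemcpy each time, the device-side accumulator can also be cleared with cudaMemset. The fragment below is only a minimal sketch of that variant (not part of the answer's code), reusing the names d_c, d_vec_a, d_vec_b, h_c, n, mytype and dot from the listing above.

// Sketch: zero the device-side accumulator with cudaMemset before each launch.
// All-zero bytes represent 0.0f / 0.0 in IEEE-754, so this is equivalent to
// the h_c[0] = 0; cudaMemcpy(d_c, h_c, ...) initialization used in main() above.
cudaMemset(d_c, 0, sizeof(mytype));   // c[0] = 0 on the device
dot(d_vec_a, d_vec_b, n, d_c);        // launch the kernel as before
cudaMemcpy(h_c, d_c, sizeof(mytype), cudaMemcpyDeviceToHost);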
I am learning the basics of CUDA and general-purpose GPU programming. But what if I used __threadfence() right after the if(x==0) c[0] = 0.0; line? Thanks Robert. Also, I think it would be better to initialize sum = 0.0 only in the void dot(...) function. Is that right?

__threadfence() does not enforce thread execution order. And yes, you can initialize c anywhere you like, as long as it happens before the kernel call.
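Regarding the CUBLAS suggestion in the answer, here is a minimal sketch of a cublasSdot-based dot product (float version; cublasDdot is the double counterpart). The helper name cublas_dot and the reuse of d_vec_a, d_vec_b and n from the listing above are just for illustration; error checking is omitted, and the program must be linked with -lcublas.

#include <cublas_v2.h>

// Minimal cuBLAS dot product sketch. d_vec_a and d_vec_b are device
// pointers holding n floats, as in main() above.
float cublas_dot(const float *d_vec_a, const float *d_vec_b, int n)
{
    cublasHandle_t handle;
    cublasCreate(&handle);                              // create cuBLAS context
    float result = 0.0f;
    // Default pointer mode is CUBLAS_POINTER_MODE_HOST, so the result is
    // written back to host memory and the call blocks until it is ready.
    cublasSdot(handle, n, d_vec_a, 1, d_vec_b, 1, &result);
    cublasDestroy(handle);
    return result;
}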