带C Cuda的无for循环的点积

带C Cuda的无for循环的点积,cuda,Cuda,我正在尝试编写c-cuda代码,以实现在内核中没有for循环的点积。以下代码将分别填充有10和15的输入向量分片到对应的共享浮点数组s_in1和s_in2中;这些数组的每个元素之间的乘法结果存储到共享浮点数组块中。对于大小为32000(inputLength=32000)的输入数组,结果是正确的(4'800'000),但是对于大小为320000(inputLength=320000)的数组,结果是错误的(48'192'608而不是48'000'000)。为什么?即使我使用可变浮点块而不是共享数组重写代码,也会出现同样的问题。

我正在尝试编写c-cuda代码,以实现在内核中没有for循环的点积。以下代码将分别填充有10和15的输入向量分片到对应的共享浮点数组s_in1和s_in2中;这些数组的每个元素之间的乘法结果存储到共享浮点数组块中。对于大小为32000(inputLength=32000)的输入数组,结果是正确的(4'800'000),但是对于大小为320000(inputLength=320000)的数组,结果是错误的(48'192'608而不是48'000'000)。为什么?即使我使用可变浮点块而不是共享数组重写代码,也会出现同样的问题。每次执行代码时,结果总是相同的。提前感谢您的帮助

我在Jetson TX1-CUDA 7.0上编译代码,代码如下:

nvcc mycode.cu -o mycode
这是完整的代码:

#define THREADS_PER_BLOCK 1000

// Dot-product kernel without an in-kernel loop: each thread loads one
// element pair, multiplies, and atomically accumulates the product into *out.
// BUG(review): *out is never zero-initialized on the device before the
// atomicAdd calls, so the accumulation starts from an undefined value.
// BUG(review): summing 320000 single-precision products exceeds float's
// 24-bit mantissa, producing the wrong result reported in the question.
// NOTE(review): no bounds check on xIndex — safe only when the grid exactly
// covers the input length (true here: 320000 / 1000 = 320 blocks).
__global__ void scalar_prod(float *in1, float *in2, float *out) 
{

__shared__ float block[THREADS_PER_BLOCK];
__shared__ float s_in1[THREADS_PER_BLOCK];
__shared__ float s_in2[THREADS_PER_BLOCK];

// Flat global index of the single element this thread handles.
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
// Stage inputs in shared memory (adds nothing here: each value is read
// back by exactly one thread, the same one that wrote it).
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];

block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
// Barrier is unnecessary: no thread reads another thread's block[] slot.
__syncthreads();
// One atomic per thread — serializes heavily on a single global address.
atomicAdd(out, block[threadIdx.x]);
}

// Host driver: fills two 320000-element vectors with 10 and 15, computes
// their dot product on the GPU via scalar_prod, and prints the result
// (expected 320000 * 150 = 48'000'000).
int main()
{

int inputLength=320000;
float *hostInput1;
float *hostInput2;
float  hostOutput=0;
float *deviceInput1;
float *deviceInput2;
float *deviceOutput;
unsigned int i;

hostInput1=(float*) malloc(inputLength*sizeof(float));
hostInput2=(float*) malloc(inputLength*sizeof(float));

for(i=0;i<inputLength;++i)
{
  hostInput1[i]=10;
  hostInput2[i]=15;
}

cudaMalloc((void **)&deviceInput1, inputLength * sizeof(float));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(float));
// BUG(review): deviceOutput is allocated but never zeroed (no cudaMemset /
// host-to-device copy of 0) before the kernel atomically accumulates into
// it — the sum starts from garbage. This is fix #1 in the answer below.
cudaMalloc((void **)&deviceOutput, sizeof(float));

cudaMemcpy(deviceInput1, hostInput1, inputLength * 
sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength * 
sizeof(float),cudaMemcpyHostToDevice);

dim3 blockDim(THREADS_PER_BLOCK);
// NOTE(review): inputLength/THREADS_PER_BLOCK is integer division, which
// truncates before ceil() ever runs — this only launches enough blocks
// when inputLength is an exact multiple of THREADS_PER_BLOCK.
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));

// NOTE(review): no cudaGetLastError()/return-code checks anywhere, so any
// launch or runtime failure would go unnoticed.
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);

cudaDeviceSynchronize();

cudaMemcpy(&hostOutput, deviceOutput,sizeof(float), cudaMemcpyDeviceToHost);

printf("\n result:%f \n",hostOutput);

cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2); 
return 0;     
}
#为每个块1000定义线程
__全局无效标量产品(浮点*in1,浮点*in2,浮点*out)
{
__共享浮点块[每个块的线程数];
__共享_uuuu浮点s_in1[每个_块的线程数];
__共享\uuuuu2浮动s\u[每个\u块的线程数];
unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
__同步线程();
atomicAdd(out,block[threadIdx.x]);
}
int main()
{
int输入长度=320000;
浮点*主机输入1;
浮点*主机输入2;
浮点输出=0;
浮动*设备输入1;
浮动*设备输入2;
浮动*设备输出;
无符号整数i;
hostInput1=(float*)malloc(inputLength*sizeof(float));
hostInput2=(float*)malloc(inputLength*sizeof(float));

就这段代码而言,至少存在两个问题:

  • 在开始对设备执行
    atomicAdd
    操作之前,您没有初始化设备输出所指向的存储。因此初始值未定义

  • 您超出了
    float
    算术的能力

  • 第1项的修复非常简单-我们可以在运行内核之前轻松地将其初始化为零。对于第2项,一个简单的“修复”将所有内容从
    float
    切换到
    double
    。但是,在您的Jetson GPU上,我们没有方便的
    atomicAdd
    内置的
    double
    值,但是它为我们提供了一个使用
    atomicCAS
    的可能实现。如果我们将这些功能结合起来,我们可以得到一个工作正常的代码:

    $ cat t122.cu
    #include <stdio.h>
    #define THREADS_PER_BLOCK 1000
    
    #ifdef USE_DOUBLE
    typedef double mytype;
    #else
    typedef float mytype;
    #endif
    
    // Double-precision atomicAdd emulated with atomicCAS, for GPUs older
    // than SM60 (e.g. the Jetson TX1, SM53) that lack a native
    // atomicAdd(double*, double). This is the standard implementation from
    // the CUDA C Programming Guide: retry the CAS until no other thread
    // modified the value between the read and the swap.
    __device__ double my_atomicAdd(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int old = *address_as_ull, assumed;
     do {
          assumed = old;
          old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
        } while (assumed != old);
      return __longlong_as_double(old);
    }
    // Float overload: a native atomicAdd(float*) exists on all supported
    // architectures, so simply forward to it. The overload pair lets the
    // kernel call my_atomicAdd regardless of which type mytype aliases.
    __device__ float my_atomicAdd(float *addr, float val){
      return atomicAdd(addr, val);
    }
    
    // Dot-product kernel, one element pair per thread, accumulated into *out
    // through my_atomicAdd so it works for both float and double mytype.
    // NOTE(review): still assumes the grid exactly covers the input length —
    // there is no bounds check on xIndex.
    __global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
    {
    __shared__ mytype block[THREADS_PER_BLOCK];
    __shared__ mytype s_in1[THREADS_PER_BLOCK];
    __shared__ mytype s_in2[THREADS_PER_BLOCK];
    
    // Flat global index of this thread's element.
    unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    
    block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
    // Barrier not strictly needed: each thread reads only its own slot.
    __syncthreads();
    my_atomicAdd(out, block[threadIdx.x]);
    }
    
    // Host driver for the fixed version: same setup as the question's code,
    // but parameterized on mytype (float or double via -DUSE_DOUBLE) and —
    // crucially — zero-initializing the device accumulator before launch.
    int main()
    {
    
    int inputLength=320000;
    mytype *hostInput1;
    mytype *hostInput2;
    mytype  hostOutput=0;
    mytype *deviceInput1;
    mytype *deviceInput2;
    mytype *deviceOutput;
    unsigned int i;
    
    hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
    hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
    
    for(i=0;i<inputLength;++i)
    {
      hostInput1[i]=10;
      hostInput2[i]=15;
    }
    
    cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceOutput, sizeof(mytype));
    
    cudaMemcpy(deviceInput1, hostInput1, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    cudaMemcpy(deviceInput2, hostInput2, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    // Fix #1 from the answer: copy hostOutput (0) into the device
    // accumulator so the atomic adds start from a defined value.
    cudaMemcpy(deviceOutput, &hostOutput,
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    dim3 blockDim(THREADS_PER_BLOCK);
    // NOTE(review): integer division truncates before ceil() — only correct
    // when inputLength is a multiple of THREADS_PER_BLOCK (it is here).
    dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
    
    scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
    
    cudaDeviceSynchronize();
    
    cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
    
    printf("\n result:%f \n",hostOutput);
    
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);
    free(hostInput1);
    free(hostInput2);
    return 0;
    }
    $ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
    $ ./t122
    
     result:48000000.000000
    $
    
    $cat t122.cu
    #包括
    #按照块1000定义线程
    #ifdef使用双
    typedef双mytype;
    #否则
    typedef-float-mytype;
    #恩迪夫
    __设备uuu双我的u原子添加(双*地址,双val){
    无符号长整型*地址作为(无符号长整型*)地址;
    无符号long long int old=*假定地址为ull;
    做{
    假定=旧;
    old=atomicCAS(地址为ull,假设为,地址为double,地址为longlong(val+,地址为double,假设为));//注意:使用整数比较避免在NaN情况下挂起(因为NaN!=NaN)
    }while(假定的!=旧的);
    返回uuu longlong_u作为u double(旧);
    }
    __设备\uuuuu浮点my\u原子添加(浮点*addr,浮点val){
    返回原子添加(addr,val);
    }
    __全局无效标量产品(mytype*in1,mytype*in2,mytype*out)
    {
    __共享类型块[每个块的线程数];
    __共享_uuuMyTypeS_in1[每个_块的线程数];
    __共享_uuuMyTypeS_in2[每个_块的线程数];
    unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
    __同步线程();
    my_atomicAdd(out,block[threadIdx.x]);
    }
    int main()
    {
    int输入长度=320000;
    mytype*主机输入1;
    mytype*主机输入2;
    mytype hostOutput=0;
    mytype*设备输入1;
    mytype*设备输入2;
    mytype*设备输出;
    无符号整数i;
    hostInput1=(mytype*)malloc(inputLength*sizeof(mytype));
    hostInput2=(mytype*)malloc(inputLength*sizeof(mytype));
    
就这段代码而言,至少存在两个问题:

  • 在开始对设备执行
    atomicAdd
    操作之前,您没有初始化设备输出所指向的存储。因此初始值未定义

  • 您超出了
    float
    算术的能力

  • 第1项的修复非常简单-我们可以在运行内核之前轻松地将其初始化为零。对于第2项,一个简单的“修复”将所有内容从
    float
    切换到
    double
    。但是,在您的Jetson GPU上,我们没有方便的
    atomicAdd
    内置的
    double
    值,但是它为我们提供了一个使用
    atomicCAS
    的可能实现。如果我们将这些功能结合起来,我们可以得到一个工作正常的代码:

    $ cat t122.cu
    #include <stdio.h>
    #define THREADS_PER_BLOCK 1000
    
    #ifdef USE_DOUBLE
    typedef double mytype;
    #else
    typedef float mytype;
    #endif
    
    // Double-precision atomicAdd emulated with atomicCAS, for GPUs older
    // than SM60 (e.g. the Jetson TX1, SM53) that lack a native
    // atomicAdd(double*, double). Standard implementation from the CUDA C
    // Programming Guide: retry the CAS until no concurrent update intervened.
    __device__ double my_atomicAdd(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int old = *address_as_ull, assumed;
     do {
          assumed = old;
          old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
        } while (assumed != old);
      return __longlong_as_double(old);
    }
    // Float overload: native atomicAdd(float*) exists everywhere, so just
    // forward — the overload pair makes my_atomicAdd work for either mytype.
    __device__ float my_atomicAdd(float *addr, float val){
      return atomicAdd(addr, val);
    }
    
    // Dot-product kernel, one element pair per thread, accumulated into *out
    // through my_atomicAdd so it works for both float and double mytype.
    // NOTE(review): still assumes the grid exactly covers the input length —
    // there is no bounds check on xIndex.
    __global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
    {
    __shared__ mytype block[THREADS_PER_BLOCK];
    __shared__ mytype s_in1[THREADS_PER_BLOCK];
    __shared__ mytype s_in2[THREADS_PER_BLOCK];
    
    // Flat global index of this thread's element.
    unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    
    block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
    // Barrier not strictly needed: each thread reads only its own slot.
    __syncthreads();
    my_atomicAdd(out, block[threadIdx.x]);
    }
    
    // Host driver for the fixed version: same setup as the question's code,
    // but parameterized on mytype (float or double via -DUSE_DOUBLE) and —
    // crucially — zero-initializing the device accumulator before launch.
    int main()
    {
    
    int inputLength=320000;
    mytype *hostInput1;
    mytype *hostInput2;
    mytype  hostOutput=0;
    mytype *deviceInput1;
    mytype *deviceInput2;
    mytype *deviceOutput;
    unsigned int i;
    
    hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
    hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
    
    for(i=0;i<inputLength;++i)
    {
      hostInput1[i]=10;
      hostInput2[i]=15;
    }
    
    cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceOutput, sizeof(mytype));
    
    cudaMemcpy(deviceInput1, hostInput1, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    cudaMemcpy(deviceInput2, hostInput2, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    // Fix #1 from the answer: copy hostOutput (0) into the device
    // accumulator so the atomic adds start from a defined value.
    cudaMemcpy(deviceOutput, &hostOutput,
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    dim3 blockDim(THREADS_PER_BLOCK);
    // NOTE(review): integer division truncates before ceil() — only correct
    // when inputLength is a multiple of THREADS_PER_BLOCK (it is here).
    dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
    
    scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
    
    cudaDeviceSynchronize();
    
    cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
    
    printf("\n result:%f \n",hostOutput);
    
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);
    free(hostInput1);
    free(hostInput2);
    return 0;
    }
    $ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
    $ ./t122
    
     result:48000000.000000
    $
    
    $cat t122.cu
    #包括
    #按照块1000定义线程
    #ifdef使用双
    typedef双mytype;
    #否则
    typedef-float-mytype;
    #恩迪夫
    __设备uuu双我的u原子添加(双*地址,双val){
    无符号长整型*地址作为(无符号长整型*)地址;
    无符号long long int old=*假定地址为ull;
    做{
    假定=旧;
    old=atomicCAS(地址为ull,假设为,地址为double,地址为longlong(val+,地址为double,假设为));//注意:使用整数比较避免在NaN情况下挂起(因为NaN!=NaN)
    }while(假定的!=旧的);
    返回uuu longlong_u作为u double(旧);
    }
    __设备\uuuuu浮点my\u原子添加(浮点*addr,浮点val){
    返回原子添加(addr,val);
    }
    __全局无效标量产品(mytype*in1,mytype*in2,mytype*out)
    {
    __共享类型块[每个块的线程数];
    __共享_uuuMyTypeS_in1[每个_块的线程数];
    __共享_uuuMyTypeS_in2[每个_块的线程数];
    unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
    __同步线程();
    my_atomicAdd(out,block[threadIdx.x]);
    }
    int main()
    {
    int输入长度=320000;
    mytype*主机输入1;
    mytype*主机输入2;