带C Cuda的无for循环的点积

带C Cuda的无for循环的点积,cuda,Cuda,我正在尝试编写c-cuda代码,以实现在内核中没有for循环的点积。以下代码将分别填充有10和15的输入向量分片到对应的共享浮点数组s_in1和s_in2中;这些数组的每个元素之间的乘法结果存储到共享浮点数组块中。对于大小为32000(inputLength=32000)的输入数组,结果是正确的(4'800'000),但是对于大小为320000(inputLength=320000)的数组,结果是错误的(48'192'608而不是48'000'000)。为什么?即使我使用可变浮点块而不是共享数组重写代码,也会出现同样的问题。

我正在尝试编写c-cuda代码,以实现在内核中没有for循环的点积。以下代码将分别填充有10和15的输入向量分片到对应的共享浮点数组s_in1和s_in2中;这些数组的每个元素之间的乘法结果存储到共享浮点数组块中。对于大小为32000(inputLength=32000)的输入数组,结果是正确的(4'800'000),但是对于大小为320000(inputLength=320000)的数组,结果是错误的(48'192'608而不是48'000'000)。为什么?即使我使用可变浮点块而不是共享数组重写代码,也会出现同样的问题。每次执行代码时,结果总是相同的。提前感谢您的帮助

我在Jetson TX1-CUDA 7.0上编译代码,代码如下:

nvcc mycode.cu -o mycode
这是完整的代码:

#define THREADS_PER_BLOCK 1000

// Dot-product kernel without an in-kernel loop: each thread loads one
// element pair, multiplies, and atomically accumulates the product into *out.
// BUG(review): *out is never zero-initialized on the device before the
// atomicAdd calls, so the accumulation starts from an undefined value.
// BUG(review): summing 320000 single-precision products exceeds float's
// 24-bit mantissa, producing the wrong result reported in the question.
// NOTE(review): no bounds check on xIndex — safe only when the grid exactly
// covers the input length (true here: 320000 / 1000 = 320 blocks).
__global__ void scalar_prod(float *in1, float *in2, float *out) 
{

__shared__ float block[THREADS_PER_BLOCK];
__shared__ float s_in1[THREADS_PER_BLOCK];
__shared__ float s_in2[THREADS_PER_BLOCK];

// Flat global index of the single element this thread handles.
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
// Stage inputs in shared memory (adds nothing here: each value is read
// back by exactly one thread, the same one that wrote it).
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];

block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
// Barrier is unnecessary: no thread reads another thread's block[] slot.
__syncthreads();
// One atomic per thread — serializes heavily on a single global address.
atomicAdd(out, block[threadIdx.x]);
}

// Host driver: fills two 320000-element vectors with 10 and 15, computes
// their dot product on the GPU via scalar_prod, and prints the result
// (expected 320000 * 150 = 48'000'000).
int main()
{

int inputLength=320000;
float *hostInput1;
float *hostInput2;
float  hostOutput=0;
float *deviceInput1;
float *deviceInput2;
float *deviceOutput;
unsigned int i;

hostInput1=(float*) malloc(inputLength*sizeof(float));
hostInput2=(float*) malloc(inputLength*sizeof(float));

for(i=0;i<inputLength;++i)
{
  hostInput1[i]=10;
  hostInput2[i]=15;
}

cudaMalloc((void **)&deviceInput1, inputLength * sizeof(float));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(float));
// BUG(review): deviceOutput is allocated but never zeroed (no cudaMemset /
// host-to-device copy of 0) before the kernel atomically accumulates into
// it — the sum starts from garbage. This is fix #1 in the answer below.
cudaMalloc((void **)&deviceOutput, sizeof(float));

cudaMemcpy(deviceInput1, hostInput1, inputLength * 
sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength * 
sizeof(float),cudaMemcpyHostToDevice);

dim3 blockDim(THREADS_PER_BLOCK);
// NOTE(review): inputLength/THREADS_PER_BLOCK is integer division, which
// truncates before ceil() ever runs — this only launches enough blocks
// when inputLength is an exact multiple of THREADS_PER_BLOCK.
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));

// NOTE(review): no cudaGetLastError()/return-code checks anywhere, so any
// launch or runtime failure would go unnoticed.
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);

cudaDeviceSynchronize();

cudaMemcpy(&hostOutput, deviceOutput,sizeof(float), cudaMemcpyDeviceToHost);

printf("\n result:%f \n",hostOutput);

cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2); 
return 0;     
}
#为每个块1000定义线程
__全局无效标量产品(浮点*in1,浮点*in2,浮点*out)
{
__共享浮点块[每个块的线程数];
__共享_uuuu浮点s_in1[每个_块的线程数];
__共享\uuuuu2浮动s\u[每个\u块的线程数];
unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
__同步线程();
atomicAdd(out,block[threadIdx.x]);
}
int main()
{
int输入长度=320000;
浮点*主机输入1;
浮点*主机输入2;
浮点输出=0;
浮动*设备输入1;
浮动*设备输入2;
浮动*设备输出;
无符号整数i;
hostInput1=(float*)malloc(inputLength*sizeof(float));
hostInput2=(float*)malloc(inputLength*sizeof(float));

就这段代码而言,至少存在两个问题:

  • 在开始对设备执行
    atomicAdd
    操作之前,您没有初始化设备输出所指向的存储。因此初始值未定义

  • 您超出了
    float
    算术的能力

  • 第1项的修复非常简单-我们可以在运行内核之前轻松地将其初始化为零。对于第2项,一个简单的“修复”将所有内容从
    float
    切换到
    double
    。但是,在您的Jetson GPU上,我们没有方便的
    atomicAdd
    内置的
    double
    值,但是它为我们提供了一个使用
    atomicCAS
    的可能实现。如果我们将这些功能结合起来,我们可以得到一个工作正常的代码:

    $ cat t122.cu
    #include <stdio.h>
    #define THREADS_PER_BLOCK 1000
    
    #ifdef USE_DOUBLE
    typedef double mytype;
    #else
    typedef float mytype;
    #endif
    
    // Double-precision atomicAdd emulated with atomicCAS, for GPUs older
    // than SM60 (e.g. the Jetson TX1, SM53) that lack a native
    // atomicAdd(double*, double). This is the standard implementation from
    // the CUDA C Programming Guide: retry the CAS until no other thread
    // modified the value between the read and the swap.
    __device__ double my_atomicAdd(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int old = *address_as_ull, assumed;
     do {
          assumed = old;
          old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
        } while (assumed != old);
      return __longlong_as_double(old);
    }
    // Float overload: a native atomicAdd(float*) exists on all supported
    // architectures, so simply forward to it. The overload pair lets the
    // kernel call my_atomicAdd regardless of which type mytype aliases.
    __device__ float my_atomicAdd(float *addr, float val){
      return atomicAdd(addr, val);
    }
    
    // Dot-product kernel, one element pair per thread, accumulated into *out
    // through my_atomicAdd so it works for both float and double mytype.
    // NOTE(review): still assumes the grid exactly covers the input length —
    // there is no bounds check on xIndex.
    __global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
    {
    __shared__ mytype block[THREADS_PER_BLOCK];
    __shared__ mytype s_in1[THREADS_PER_BLOCK];
    __shared__ mytype s_in2[THREADS_PER_BLOCK];
    
    // Flat global index of this thread's element.
    unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    
    block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
    // Barrier not strictly needed: each thread reads only its own slot.
    __syncthreads();
    my_atomicAdd(out, block[threadIdx.x]);
    }
    
    // Host driver for the fixed version: same setup as the question's code,
    // but parameterized on mytype (float or double via -DUSE_DOUBLE) and —
    // crucially — zero-initializing the device accumulator before launch.
    int main()
    {
    
    int inputLength=320000;
    mytype *hostInput1;
    mytype *hostInput2;
    mytype  hostOutput=0;
    mytype *deviceInput1;
    mytype *deviceInput2;
    mytype *deviceOutput;
    unsigned int i;
    
    hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
    hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
    
    for(i=0;i<inputLength;++i)
    {
      hostInput1[i]=10;
      hostInput2[i]=15;
    }
    
    cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceOutput, sizeof(mytype));
    
    cudaMemcpy(deviceInput1, hostInput1, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    cudaMemcpy(deviceInput2, hostInput2, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    // Fix #1 from the answer: copy hostOutput (0) into the device
    // accumulator so the atomic adds start from a defined value.
    cudaMemcpy(deviceOutput, &hostOutput,
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    dim3 blockDim(THREADS_PER_BLOCK);
    // NOTE(review): integer division truncates before ceil() — only correct
    // when inputLength is a multiple of THREADS_PER_BLOCK (it is here).
    dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
    
    scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
    
    cudaDeviceSynchronize();
    
    cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
    
    printf("\n result:%f \n",hostOutput);
    
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);
    free(hostInput1);
    free(hostInput2);
    return 0;
    }
    $ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
    $ ./t122
    
     result:48000000.000000
    $
    
    $cat t122.cu
    #包括
    #按照块1000定义线程
    #ifdef使用双
    typedef双mytype;
    #否则
    typedef-float-mytype;
    #恩迪夫
    __设备uuu双我的u原子添加(双*地址,双val){
    无符号长整型*地址作为(无符号长整型*)地址;
    无符号long long int old=*假定地址为ull;
    做{
    假定=旧;
    old=atomicCAS(地址为ull,假设为,地址为double,地址为longlong(val+,地址为double,假设为));//注意:使用整数比较避免在NaN情况下挂起(因为NaN!=NaN)
    }while(假定的!=旧的);
    返回uuu longlong_u作为u double(旧);
    }
    __设备\uuuuu浮点my\u原子添加(浮点*addr,浮点val){
    返回原子添加(addr,val);
    }
    __全局无效标量产品(mytype*in1,mytype*in2,mytype*out)
    {
    __共享类型块[每个块的线程数];
    __共享_uuuMyTypeS_in1[每个_块的线程数];
    __共享_uuuMyTypeS_in2[每个_块的线程数];
    unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
    __同步线程();
    my_atomicAdd(out,block[threadIdx.x]);
    }
    int main()
    {
    int输入长度=320000;
    mytype*主机输入1;
    mytype*主机输入2;
    mytype hostOutput=0;
    mytype*设备输入1;
    mytype*设备输入2;
    mytype*设备输出;
    无符号整数i;
    hostInput1=(mytype*)malloc(inputLength*sizeof(mytype));
    hostInput2=(mytype*)malloc(inputLength*sizeof(mytype));
    
就这段代码而言,至少存在两个问题:

  • 在开始对设备执行
    atomicAdd
    操作之前,您没有初始化设备输出所指向的存储。因此初始值未定义

  • 您超出了
    float
    算术的能力

  • 第1项的修复非常简单-我们可以在运行内核之前轻松地将其初始化为零。对于第2项,一个简单的“修复”将所有内容从
    float
    切换到
    double
    。但是,在您的Jetson GPU上,我们没有方便的
    atomicAdd
    内置的
    double
    值,但是它为我们提供了一个使用
    atomicCAS
    的可能实现。如果我们将这些功能结合起来,我们可以得到一个工作正常的代码:

    $ cat t122.cu
    #include <stdio.h>
    #define THREADS_PER_BLOCK 1000
    
    #ifdef USE_DOUBLE
    typedef double mytype;
    #else
    typedef float mytype;
    #endif
    
    // Double-precision atomicAdd emulated with atomicCAS, for GPUs older
    // than SM60 (e.g. the Jetson TX1, SM53) that lack a native
    // atomicAdd(double*, double). Standard implementation from the CUDA C
    // Programming Guide: retry the CAS until no concurrent update intervened.
    __device__ double my_atomicAdd(double* address, double val) {
     unsigned long long int* address_as_ull = (unsigned long long int*)address;
     unsigned long long int old = *address_as_ull, assumed;
     do {
          assumed = old;
          old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
        } while (assumed != old);
      return __longlong_as_double(old);
    }
    // Float overload: native atomicAdd(float*) exists everywhere, so just
    // forward — the overload pair makes my_atomicAdd work for either mytype.
    __device__ float my_atomicAdd(float *addr, float val){
      return atomicAdd(addr, val);
    }
    
    // Dot-product kernel, one element pair per thread, accumulated into *out
    // through my_atomicAdd so it works for both float and double mytype.
    // NOTE(review): still assumes the grid exactly covers the input length —
    // there is no bounds check on xIndex.
    __global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
    {
    __shared__ mytype block[THREADS_PER_BLOCK];
    __shared__ mytype s_in1[THREADS_PER_BLOCK];
    __shared__ mytype s_in2[THREADS_PER_BLOCK];
    
    // Flat global index of this thread's element.
    unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    
    block[threadIdx.x] =  s_in1[threadIdx.x] * s_in2[threadIdx.x];
    // Barrier not strictly needed: each thread reads only its own slot.
    __syncthreads();
    my_atomicAdd(out, block[threadIdx.x]);
    }
    
    // Host driver for the fixed version: same setup as the question's code,
    // but parameterized on mytype (float or double via -DUSE_DOUBLE) and —
    // crucially — zero-initializing the device accumulator before launch.
    int main()
    {
    
    int inputLength=320000;
    mytype *hostInput1;
    mytype *hostInput2;
    mytype  hostOutput=0;
    mytype *deviceInput1;
    mytype *deviceInput2;
    mytype *deviceOutput;
    unsigned int i;
    
    hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
    hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
    
    for(i=0;i<inputLength;++i)
    {
      hostInput1[i]=10;
      hostInput2[i]=15;
    }
    
    cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
    cudaMalloc((void **)&deviceOutput, sizeof(mytype));
    
    cudaMemcpy(deviceInput1, hostInput1, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    cudaMemcpy(deviceInput2, hostInput2, inputLength *
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    // Fix #1 from the answer: copy hostOutput (0) into the device
    // accumulator so the atomic adds start from a defined value.
    cudaMemcpy(deviceOutput, &hostOutput,
    sizeof(mytype),cudaMemcpyHostToDevice);
    
    dim3 blockDim(THREADS_PER_BLOCK);
    // NOTE(review): integer division truncates before ceil() — only correct
    // when inputLength is a multiple of THREADS_PER_BLOCK (it is here).
    dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
    
    scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
    
    cudaDeviceSynchronize();
    
    cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
    
    printf("\n result:%f \n",hostOutput);
    
    cudaFree(deviceInput1);
    cudaFree(deviceInput2);
    cudaFree(deviceOutput);
    free(hostInput1);
    free(hostInput2);
    return 0;
    }
    $ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
    $ ./t122
    
     result:48000000.000000
    $
    
    $cat t122.cu
    #包括
    #按照块1000定义线程
    #ifdef使用双
    typedef双mytype;
    #否则
    typedef-float-mytype;
    #恩迪夫
    __设备uuu双我的u原子添加(双*地址,双val){
    无符号长整型*地址作为(无符号长整型*)地址;
    无符号long long int old=*假定地址为ull;
    做{
    假定=旧;
    old=atomicCAS(地址为ull,假设为,地址为double,地址为longlong(val+,地址为double,假设为));//注意:使用整数比较避免在NaN情况下挂起(因为NaN!=NaN)
    }while(假定的!=旧的);
    返回uuu longlong_u作为u double(旧);
    }
    __设备\uuuuu浮点my\u原子添加(浮点*addr,浮点val){
    返回原子添加(addr,val);
    }
    __全局无效标量产品(mytype*in1,mytype*in2,mytype*out)
    {
    __共享类型块[每个块的线程数];
    __共享_uuuMyTypeS_in1[每个_块的线程数];
    __共享_uuuMyTypeS_in2[每个_块的线程数];
    unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
    s_in1[threadIdx.x]=in1[xIndex];
    s_in2[threadIdx.x]=in2[xIndex];
    块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
    __同步线程();
    my_atomicAdd(out,block[threadIdx.x]);
    }
    int main()
    {
    int输入长度=320000;
    mytype*主机输入1;
    mytype*主机输入2;