带C Cuda的无for循环的点积
我正在尝试编写c-cuda代码,以实现在内核中没有for循环的点积。以下代码将分别填充有10和15的输入向量分片到对应的共享浮点数组s_in1和s_in2中;这些数组的每个元素之间的乘法结果存储到共享浮点数组块中。对于大小为32000(inputLength=32000)的输入数组,结果是正确的(4'800'000),但是对于大小为320000(inputLength=320000)的数组,结果是错误的(48'192'608而不是48'000'000)。为什么?即使我使用可变浮点块而不是共享数组重写代码,也会出现同样的问题。每次执行代码时,结果总是相同的。提前感谢您的帮助 我在Jetson TX1-CUDA 7.0上编译代码,代码如下:带C Cuda的无for循环的点积,cuda,Cuda,我正在尝试编写c-cuda代码,以实现在内核中没有for循环的点积。以下代码将分别填充有10和15的输入向量分片到对应的共享浮点数组s_in1和s_in2中;这些数组的每个元素之间的乘法结果存储到共享浮点数组块中。对于大小为32000(inputLength=32000)的输入数组,结果是正确的(4'800'000),但是对于大小为320000(inputLength=320000)的数组,结果是错误的(48'192'608而不是48'000'000)。为什么?即使我使用可变浮点块而不是共享数组
nvcc mycode.cu -o mycode
这是完整的代码:
#define THREADS_PER_BLOCK 1000
__global__ void scalar_prod(float *in1, float *in2, float *out)
{
__shared__ float block[THREADS_PER_BLOCK];
__shared__ float s_in1[THREADS_PER_BLOCK];
__shared__ float s_in2[THREADS_PER_BLOCK];
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
block[threadIdx.x] = s_in1[threadIdx.x] * s_in2[threadIdx.x];
__syncthreads();
atomicAdd(out, block[threadIdx.x]);
}
int main()
{
int inputLength=320000;
float *hostInput1;
float *hostInput2;
float hostOutput=0;
float *deviceInput1;
float *deviceInput2;
float *deviceOutput;
unsigned int i;
hostInput1=(float*) malloc(inputLength*sizeof(float));
hostInput2=(float*) malloc(inputLength*sizeof(float));
for(i=0;i<inputLength;++i)
{
hostInput1[i]=10;
hostInput2[i]=15;
}
cudaMalloc((void **)&deviceInput1, inputLength * sizeof(float));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(float));
cudaMalloc((void **)&deviceOutput, sizeof(float));
cudaMemcpy(deviceInput1, hostInput1, inputLength *
sizeof(float),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength *
sizeof(float),cudaMemcpyHostToDevice);
dim3 blockDim(THREADS_PER_BLOCK);
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(float), cudaMemcpyDeviceToHost);
printf("\n result:%f \n",hostOutput);
cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2);
return 0;
}
#为每个块1000定义线程
__全局无效标量产品(浮点*in1,浮点*in2,浮点*out)
{
__共享浮点块[每个块的线程数];
__共享_uuuu浮点s_in1[每个_块的线程数];
__共享\uuuuu2浮动s\u[每个\u块的线程数];
unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
__同步线程();
atomicAdd(out,block[threadIdx.x]);
}
int main()
{
int输入长度=320000;
浮点*主机输入1;
浮点*主机输入2;
浮点输出=0;
浮动*设备输入1;
浮动*设备输入2;
浮动*设备输出;
无符号整数i;
hostInput1=(float*)malloc(inputLength*sizeof(float));
hostInput2=(float*)malloc(inputLength*sizeof(float));
对于(i=0;i而言,代码至少存在两个问题:
在开始对设备执行atomicAdd
操作之前,您没有初始化设备输出所指向的存储。因此初始值未定义
您超出了float
算术的能力
第1项的修复非常简单-我们可以在运行内核之前轻松地将其初始化为零。对于第2项,一个简单的“修复”将所有内容从float
切换到double
。但是,在您的Jetson GPU上,我们没有方便的atomicAdd
内置的double
值,但是它为我们提供了一个使用atomicCAS
的可能实现。如果我们将这些功能结合起来,我们可以得到一个工作正常的代码:
$ cat t122.cu
#include <stdio.h>
#define THREADS_PER_BLOCK 1000
#ifdef USE_DOUBLE
typedef double mytype;
#else
typedef float mytype;
#endif
__device__ double my_atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
__device__ float my_atomicAdd(float *addr, float val){
return atomicAdd(addr, val);
}
__global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
{
__shared__ mytype block[THREADS_PER_BLOCK];
__shared__ mytype s_in1[THREADS_PER_BLOCK];
__shared__ mytype s_in2[THREADS_PER_BLOCK];
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
block[threadIdx.x] = s_in1[threadIdx.x] * s_in2[threadIdx.x];
__syncthreads();
my_atomicAdd(out, block[threadIdx.x]);
}
int main()
{
int inputLength=320000;
mytype *hostInput1;
mytype *hostInput2;
mytype hostOutput=0;
mytype *deviceInput1;
mytype *deviceInput2;
mytype *deviceOutput;
unsigned int i;
hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
for(i=0;i<inputLength;++i)
{
hostInput1[i]=10;
hostInput2[i]=15;
}
cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceOutput, sizeof(mytype));
cudaMemcpy(deviceInput1, hostInput1, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
cudaMemcpy(deviceOutput, &hostOutput,
sizeof(mytype),cudaMemcpyHostToDevice);
dim3 blockDim(THREADS_PER_BLOCK);
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
printf("\n result:%f \n",hostOutput);
cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2);
return 0;
}
$ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
$ ./t122
result:48000000.000000
$
$cat t122.cu
#包括
#按照块1000定义线程
#ifdef使用双
typedef双mytype;
#否则
typedef-float-mytype;
#恩迪夫
__设备uuu双我的u原子添加(双*地址,双val){
无符号长整型*地址作为(无符号长整型*)地址;
无符号long long int old=*假定地址为ull;
做{
假定=旧;
old=atomicCAS(地址为ull,假设为,地址为double,地址为longlong(val+,地址为double,假设为));//注意:使用整数比较避免在NaN情况下挂起(因为NaN!=NaN)
}while(假定的!=旧的);
返回uuu longlong_u作为u double(旧);
}
__设备\uuuuu浮点my\u原子添加(浮点*addr,浮点val){
返回原子添加(addr,val);
}
__全局无效标量产品(mytype*in1,mytype*in2,mytype*out)
{
__共享类型块[每个块的线程数];
__共享_uuuMyTypeS_in1[每个_块的线程数];
__共享_uuuMyTypeS_in2[每个_块的线程数];
unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
__同步线程();
my_atomicAdd(out,block[threadIdx.x]);
}
int main()
{
int输入长度=320000;
mytype*主机输入1;
mytype*主机输入2;
mytype hostOutput=0;
mytype*设备输入1;
mytype*设备输入2;
mytype*设备输出;
无符号整数i;
hostInput1=(mytype*)malloc(inputLength*sizeof(mytype));
hostInput2=(mytype*)malloc(inputLength*sizeof(mytype));
对于(i=0;i而言,代码至少存在两个问题:
在开始对设备执行atomicAdd
操作之前,您没有初始化设备输出所指向的存储。因此初始值未定义
您超出了float
算术的能力
第1项的修复非常简单-我们可以在运行内核之前轻松地将其初始化为零。对于第2项,一个简单的“修复”将所有内容从float
切换到double
。但是,在您的Jetson GPU上,我们没有方便的atomicAdd
内置的double
值,但是它为我们提供了一个使用atomicCAS
的可能实现。如果我们将这些功能结合起来,我们可以得到一个工作正常的代码:
$ cat t122.cu
#include <stdio.h>
#define THREADS_PER_BLOCK 1000
#ifdef USE_DOUBLE
typedef double mytype;
#else
typedef float mytype;
#endif
__device__ double my_atomicAdd(double* address, double val) {
unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
__device__ float my_atomicAdd(float *addr, float val){
return atomicAdd(addr, val);
}
__global__ void scalar_prod(mytype *in1, mytype *in2, mytype *out)
{
__shared__ mytype block[THREADS_PER_BLOCK];
__shared__ mytype s_in1[THREADS_PER_BLOCK];
__shared__ mytype s_in2[THREADS_PER_BLOCK];
unsigned int xIndex = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
block[threadIdx.x] = s_in1[threadIdx.x] * s_in2[threadIdx.x];
__syncthreads();
my_atomicAdd(out, block[threadIdx.x]);
}
int main()
{
int inputLength=320000;
mytype *hostInput1;
mytype *hostInput2;
mytype hostOutput=0;
mytype *deviceInput1;
mytype *deviceInput2;
mytype *deviceOutput;
unsigned int i;
hostInput1=(mytype*) malloc(inputLength*sizeof(mytype));
hostInput2=(mytype*) malloc(inputLength*sizeof(mytype));
for(i=0;i<inputLength;++i)
{
hostInput1[i]=10;
hostInput2[i]=15;
}
cudaMalloc((void **)&deviceInput1, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceInput2, inputLength * sizeof(mytype));
cudaMalloc((void **)&deviceOutput, sizeof(mytype));
cudaMemcpy(deviceInput1, hostInput1, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
cudaMemcpy(deviceInput2, hostInput2, inputLength *
sizeof(mytype),cudaMemcpyHostToDevice);
cudaMemcpy(deviceOutput, &hostOutput,
sizeof(mytype),cudaMemcpyHostToDevice);
dim3 blockDim(THREADS_PER_BLOCK);
dim3 gridDim(ceil(inputLength/THREADS_PER_BLOCK));
scalar_prod<<<gridDim, blockDim>>>(deviceInput1, deviceInput2, deviceOutput);
cudaDeviceSynchronize();
cudaMemcpy(&hostOutput, deviceOutput,sizeof(mytype), cudaMemcpyDeviceToHost);
printf("\n result:%f \n",hostOutput);
cudaFree(deviceInput1);
cudaFree(deviceInput2);
cudaFree(deviceOutput);
free(hostInput1);
free(hostInput2);
return 0;
}
$ nvcc -arch=sm_30 -o t122 t122.cu -DUSE_DOUBLE
$ ./t122
result:48000000.000000
$
$cat t122.cu
#包括
#按照块1000定义线程
#ifdef使用双
typedef双mytype;
#否则
typedef-float-mytype;
#恩迪夫
__设备uuu双我的u原子添加(双*地址,双val){
无符号长整型*地址作为(无符号长整型*)地址;
无符号long long int old=*假定地址为ull;
做{
假定=旧;
old=atomicCAS(地址为ull,假设为,地址为double,地址为longlong(val+,地址为double,假设为));//注意:使用整数比较避免在NaN情况下挂起(因为NaN!=NaN)
}while(假定的!=旧的);
返回uuu longlong_u作为u double(旧);
}
__设备\uuuuu浮点my\u原子添加(浮点*addr,浮点val){
返回原子添加(addr,val);
}
__全局无效标量产品(mytype*in1,mytype*in2,mytype*out)
{
__共享类型块[每个块的线程数];
__共享_uuuMyTypeS_in1[每个_块的线程数];
__共享_uuuMyTypeS_in2[每个_块的线程数];
unsigned int xIndex=blockIdx.x*每个块的线程数+threadIdx.x;
s_in1[threadIdx.x]=in1[xIndex];
s_in2[threadIdx.x]=in2[xIndex];
块[threadIdx.x]=s_in1[threadIdx.x]*s_in2[threadIdx.x];
__同步线程();
my_atomicAdd(out,block[threadIdx.x]);
}
int main()
{
int输入长度=320000;
mytype*主机输入1;
mytype*主机输入2;