CUDA内核不返回值
我正在使用具有多个GPU的服务器。我使用openMP一次在多个GPU上启动内核。我看到的问题是,我正在运行的内核似乎没有更新它所传递的推力设备向量中的值。下面的代码应该为设备向量中的所有元素输出一个值1,而不是输出一个值0。代码编译并运行,向我显示内核成功执行 我不明白为什么这段代码没有按预期运行CUDA内核不返回值,cuda,openmp,thrust,Cuda,Openmp,Thrust,我正在使用具有多个GPU的服务器。我使用openMP一次在多个GPU上启动内核。我看到的问题是,我正在运行的内核似乎没有更新它所传递的推力设备向量中的值。下面的代码应该为设备向量中的所有元素输出一个值1,而不是输出一个值0。代码编译并运行,向我显示内核成功执行 我不明白为什么这段代码没有按预期运行 #include <iostream> #include <cmath> #include <omp.h> #include <vector> #inc
#include <iostream>
#include <cmath>
#include <omp.h>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>
using namespace::std;
const long N_R1 = 100;
const long N_R2 = 100;
__global__ void kernel(long* ND, long* NR1,
float* a, float* b, float* c, float* d)
{
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
//Values correspond to 2D array limits
long idxR1 = idx / ND[0];
long idxR2 = idx % ND[0];
if(idxR1 >= NR1[0] || idxR2 >= ND[0])
{
return;
}
a[idx] =1.0;
b[idx] =1.0;
c[idx] =1.0;
d[idx] =1.0;
}
void kernel_wrapper()
{
// GPU Count
int num_gpus = 0;
cudaGetDeviceCount(&num_gpus);
omp_set_num_threads(num_gpus);
//Calculate Dimensioning
long D_total = N_R1 * N_R2;
//Region 1 coordinates are loaded on to each GPU
//Region 2 coordinates are divided up onto GPUs
long R2_stride = ceil(float(N_R2)/float(num_gpus));
//Distance arrays need to be split longo whole sections of region 1.
//(Distances size = N_R1 * N_R2) subset of distance size needs to be N_R1
long D_stride = R2_stride * N_R1;
#pragma omp parallel
{
// Get CPU thread number
long cpu_thread_id = omp_get_thread_num();
cudaSetDevice(cpu_thread_id);
// Set up Local Arrays for distance and potential
// Step 1: Calculate rough Array Limits
// If array spaces divide evenly between threads then beginnings and endings can be calculated below
long R2_begin = cpu_thread_id * R2_stride;
long D_begin = cpu_thread_id * D_stride;
long R2_end = R2_begin + R2_stride;
long D_end = D_begin + D_stride;
// Step 2: Check Ends are not out of bounds
// The last thread in the calculation is likely to have array sizings that are out of bounds
// if this is the case then the ends need to be clipped:
if(R2_end >= N_R2)
{
R2_end = N_R2;
}
if(D_end >= D_total)
{
D_end = D_total;
}
// Local aray sizes are (end - begin)
long l_R2 = R2_end - R2_begin;
long l_D = D_end - D_begin;
float zero = 0.0;
// Create Region 2 potential components
thrust::host_vector<float > a(l_D,zero);
thrust::host_vector<float > b(l_D,zero);
thrust::host_vector<float > c(l_D,zero);
thrust::host_vector<float > d(l_D,zero);
long* p_NR1;
long nr1 = N_R1;
cudaMalloc( (void**)&p_NR1, sizeof(long) );
cudaMemcpy( p_NR1, &nr1, sizeof(long), cudaMemcpyHostToDevice);
long* p_NR2;
cudaMalloc( (void**)&p_NR2, sizeof(long) );
cudaMemcpy( p_NR2, &l_D, sizeof(long), cudaMemcpyHostToDevice);
//Generate Device Side Data for region 2 potential components
thrust::device_vector< float > d_a = a;
thrust::device_vector< float > d_b = b;
thrust::device_vector< float > d_c = c;
thrust::device_vector< float > d_d = d;
// Generate pointers to Device Side Data for region 2 potential components
float* p_a = thrust::raw_pointer_cast(d_a.data());
float* p_b = thrust::raw_pointer_cast(d_b.data());
float* p_c = thrust::raw_pointer_cast(d_c.data());
float* p_d = thrust::raw_pointer_cast(d_d.data());
dim3 blocks = N_R1;
dim3 threads = l_R2;
kernel<<<blocks,threads>>>(p_NR2, p_NR1,
p_a, p_b, p_c, p_d);
cudaDeviceSynchronize();
if(cudaGetLastError() == cudaSuccess)
{
cout << "Kernel Successful!" << cudaGetErrorString(cudaGetLastError()) << endl;
cin.ignore(1);
}
a = d_a;
b = d_b;
c = d_c;
d = d_d;
for(long j = 0; j != a.size(); j++)
{
cout << "a[" << j << "] = " << a[j] << endl;
}
for(long j = 0; j != b.size(); j++)
{
cout << "b[" << j << "] = " << b[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
for(long j = 0; j != c.size(); j++)
{
cout << "c[" << j << "] = " << c[j] << endl;
}
}
cin.ignore(1);
}
int main()
{
kernel_wrapper();
return 0;
}
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
#包括
使用namespace::std;
常数长N_R1=100;
常数长N_R2=100;
__全局无效内核(长*ND,长*NR1,
浮点数*a、浮点数*b、浮点数*c、浮点数*d)
{
//计算全局索引(通用三维块、三维线程)
long idx=(blockIdx.x+blockIdx.y*gridDim.x*gridDim.y*blockIdx.z)
*(threadIdx.z*(blockDim.x*blockDim.y))+threadIdx.y
*blockDim.x+threadIdx.x;
//值对应于二维数组限制
长idxR1=idx/ND[0];
长idxR2=idx%ND[0];
如果(idxR1>=NR1[0]| | idxR2>=ND[0])
{
返回;
}
a[idx]=1.0;
b[idx]=1.0;
c[idx]=1.0;
d[idx]=1.0;
}
void kernel_wrapper()
{
//GPU计数
int num_gpus=0;
cudaGetDeviceCount(&numgpu);
omp_设置_num_线程(num_GPU);
//计算尺寸
长D_总计=N_R1*N_R2;
//区域1坐标加载到每个GPU
//区域2坐标被划分到GPU上
长R2_步幅=ceil(浮动(N_R2)/浮动(num_GPU));
//距离阵列需要拆分为区域1的整个部分。
//(距离大小=N_R1*N_R2)距离大小的子集需要为N_R1
长D_步幅=R2_步幅*N_R1;
#pragma-omp并行
{
//获取CPU线程数
长cpu_线程_id=omp_get_thread_num();
cudaSetDevice(cpu线程id);
//为距离和电位设置本地阵列
//步骤1:计算粗略数组限制
//如果数组空间在线程之间平均分配,则可以在下面计算开始和结束
长R2\u开始=cpu\u线程\u id*R2\u步幅;
长D_开始=cpu_线程id*D_步幅;
长R2\u结束=R2\u开始+R2\u步幅;
长距离D_结束=D_开始+D_大步;
//第2步:检查端点是否超出范围
//计算中的最后一个线程可能具有超出边界的数组大小
//如果是这种情况,则需要修剪端部:
如果(R2\u end>=N\u R2)
{
R2_end=N_R2;
}
如果(D_结束>=D_总计)
{
D_end=D_总数;
}
//本地aray大小为(结束-开始)
长l_R2=R2_结束-R2_开始;
长l_D=D_end-D_begin;
浮动零点=0.0;
//创建区域2潜在组件
推力:主向量a(l_D,零);
推力:主向量b(l_D,零);
推力:主向量c(l_D,零);
推力:主向量d(l\d,零);
长*p_NR1;
长nr1=N_R1;
Cudamaloc((void**)和p_NR1,sizeof(long));
cudaMemcpy(p_NR1和NR1,sizeof(long),cudaMemcpyHostToDevice);
长*p_NR2;
Cudamaloc((void**)和p_NR2,sizeof(long));
cudaMemcpy(p_NR2和l_D,sizeof(long),cudaMemcpyHostToDevice);
//为区域2潜在组件生成设备端数据
推力:装置矢量d\u a=a;
推力:设备_向量d_b=b;
推力:装置矢量d\u c=c;
推力:设备_矢量d_d=d;
//为区域2潜在组件生成指向设备端数据的指针
float*p_a=推力::原始指针(d_a.data());
float*p_b=推力::原始指针(d_b.data());
float*p_c=推力::原始指针(d_c.data());
float*p_d=推力::原始指针(d_d.data());
dim3块=N_R1;
dim3螺纹=l_R2;
内核(p_NR2,p_NR1,
p_a,p_b,p_c,p_d);
cudaDeviceSynchronize();
if(cudaGetLastError()==cudaSuccess)
{
cout一些输出值被设置为1,一些未设置。问题是由以下语句引起的:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
这不是我所说的将3D网格/块转换为全局唯一的1D索引的正确通用转换,我假设这是您的意图。让我们只举一个例子来证明它是错误的。假设您正在启动一个由1D块组成的1D网格(这就是您正在做的)。然后所有(块、线程)Idx.y和.z变量都将为零。在该启动配置中,只有blockIdx.x和threadIdx.x可以采用非零值
在这种情况下,表达式将简化为:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + 0 * gridDim.x * gridDim.y * 0 )
* ( 0 * ( blockDim.x*blockDim.y ) ) + 0
* blockDim.x + threadIdx.x;
long idx = threadIdx.x;
i、 e.减少到:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + 0 * gridDim.x * gridDim.y * 0 )
* ( 0 * ( blockDim.x*blockDim.y ) ) + 0
* blockDim.x + threadIdx.x;
long idx = threadIdx.x;
因此,数组的第一个(块大小)元素(a、b、c、d)由于threadIdx.x
从一个块到下一个块不是唯一的,因此这不是一个正确的全局唯一线程ID,因此每个块都在写入相同的输出位置,而不是每个块负责阵列的单独部分
那么,什么是可能的(正确的)通用三维到一维索引转换
答案是这样的(可能还有其他地方)。这个答案实际上只将3D网格加1D块配置转换为全局唯一的ID,但对于演示代码中的错误已经足够了
当我用该代码替换您的idx
内核内计算时,根据我的测试,您的内核会用1.0填充所有数组项。一些输出值设置为1,一些则不是。问题是由于以下语句:
// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
* ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y
* blockDim.x + threadIdx.x;
这不是我所说的3D网格/块到全局唯一1D索引的适当通用转换,这