CUDA kernel not returning values

I am working on a server with multiple GPUs. I use OpenMP to launch kernels on several GPUs at once. The problem I am seeing is that the kernel does not appear to update the values in the thrust device vectors it is passed. The code below should output a value of 1 for every element of the device vectors, but instead it outputs 0. The code compiles and runs, and tells me the kernel executed successfully.

I do not understand why this code is not behaving as expected.

#include <iostream>
#include <cmath>
#include <omp.h>
#include <vector>
#include <thrust/host_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/device_vector.h>



using namespace::std;


const long N_R1 = 100;
const long N_R2 = 100;


__global__ void kernel(long* ND, long* NR1, 
                       float* a, float* b, float* c, float* d)

{
    // Calculate Global index (Generic 3D block, 3D thread)
    long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
              * ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y 
              * blockDim.x + threadIdx.x;

    //Values correspond to 2D array limits
    long idxR1 = idx / ND[0];
    long idxR2 = idx % ND[0];

    if(idxR1 >= NR1[0] || idxR2 >= ND[0])
    {
        return;
    }

    a[idx] = 1.0;
    b[idx] = 1.0;
    c[idx] = 1.0;
    d[idx] = 1.0;

}


void kernel_wrapper()
{
    // GPU Count
    int num_gpus = 0;
    cudaGetDeviceCount(&num_gpus);
    omp_set_num_threads(num_gpus);

    //Calculate Dimensioning
    long D_total = N_R1 * N_R2;
    //Region 1 coordinates are loaded on to each GPU
    //Region 2 coordinates are divided up onto GPUs
    long R2_stride = ceil(float(N_R2)/float(num_gpus));

    //Distance arrays need to be split into whole sections of region 1. 
    //(Distances size = N_R1 * N_R2) subset of distance size needs to be N_R1
    long D_stride = R2_stride * N_R1;


#pragma omp parallel
    {

        // Get CPU thread number
        long cpu_thread_id = omp_get_thread_num();

        cudaSetDevice(cpu_thread_id);

        // Set up Local Arrays for distance and potential
        // Step 1: Calculate rough Array Limits
        // If array spaces divide evenly between threads then beginnings and endings can be calculated below
        long R2_begin = cpu_thread_id * R2_stride;
        long D_begin  = cpu_thread_id * D_stride;

        long R2_end = R2_begin + R2_stride;
        long D_end  = D_begin + D_stride;

        // Step 2: Check Ends are not out of bounds
        //         The last thread in the calculation is likely to have array sizings that are out of bounds
        //         if this is the case then the ends need to be clipped:
        if(R2_end >= N_R2)
        {
            R2_end = N_R2;
        }
        if(D_end >= D_total)
        {
            D_end = D_total;
        }

        // Local array sizes are (end - begin)
        long l_R2 = R2_end - R2_begin;
        long l_D     = D_end - D_begin;

        float zero = 0.0;
        // Create Region 2 potential components
        thrust::host_vector<float > a(l_D,zero);
        thrust::host_vector<float > b(l_D,zero);
        thrust::host_vector<float > c(l_D,zero);
        thrust::host_vector<float > d(l_D,zero);

        long* p_NR1;
        long nr1 = N_R1;
        cudaMalloc( (void**)&p_NR1, sizeof(long) );
        cudaMemcpy( p_NR1, &nr1, sizeof(long), cudaMemcpyHostToDevice);

        long* p_NR2;
        cudaMalloc( (void**)&p_NR2, sizeof(long) );
        cudaMemcpy( p_NR2, &l_D, sizeof(long), cudaMemcpyHostToDevice);

        //Generate Device Side Data for region 2 potential components
        thrust::device_vector< float > d_a = a;
        thrust::device_vector< float > d_b = b;
        thrust::device_vector< float > d_c = c;
        thrust::device_vector< float > d_d = d;
        // Generate pointers to Device Side Data for region 2 potential components
        float* p_a = thrust::raw_pointer_cast(d_a.data());
        float* p_b = thrust::raw_pointer_cast(d_b.data());
        float* p_c = thrust::raw_pointer_cast(d_c.data());
        float* p_d = thrust::raw_pointer_cast(d_d.data());

        dim3 blocks = N_R1;
        dim3 threads = l_R2;
        kernel<<<blocks,threads>>>(p_NR2, p_NR1,
                                   p_a, p_b, p_c, p_d);
        cudaDeviceSynchronize();
        if(cudaGetLastError() == cudaSuccess)
        {
            cout << "Kernel Successful!" << cudaGetErrorString(cudaGetLastError()) << endl;
            cin.ignore(1);
        }

        a = d_a;
        b = d_b;
        c = d_c;
        d = d_d;

        for(long j = 0; j != a.size(); j++)
        {
            cout << "a[" << j << "] = " << a[j] << endl;
        }
        for(long j = 0; j != b.size(); j++)
        {
            cout << "b[" << j << "] = " << b[j] << endl;
        }
        for(long j = 0; j != c.size(); j++)
        {
            cout << "c[" << j << "] = " << c[j] << endl;
        }
        for(long j = 0; j != d.size(); j++)
        {
            cout << "d[" << j << "] = " << d[j] << endl;
        }
    }
    cin.ignore(1);
}

int main()
{

    kernel_wrapper();

    return 0;
}

Some of the output values are getting set to 1, and some are not. The problem arises from this statement:

// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + blockIdx.y * gridDim.x * gridDim.y * blockIdx.z )
          * ( threadIdx.z * ( blockDim.x*blockDim.y ) ) + threadIdx.y 
          * blockDim.x + threadIdx.x;
This is not what I would call a proper general conversion of a 3D grid/block to a globally unique 1D index, which I assume is your intent. Let's just pick one example to show that it is broken. Suppose you launch a 1D grid of 1D blocks (which is what you are doing). Then all of the (block,thread)Idx.y and .z variables will be zero. In that launch configuration, only blockIdx.x and threadIdx.x can take on non-zero values.

In that case, the expression reduces to:

// Calculate Global index (Generic 3D block, 3D thread)
long idx = ( blockIdx.x + 0 * gridDim.x * gridDim.y * 0 )
          * ( 0 * ( blockDim.x*blockDim.y ) ) + 0 
          * blockDim.x + threadIdx.x;

i.e. it reduces to:

long idx = threadIdx.x;

So the first (block-size) elements of your arrays (a, b, c, d) are being set properly, and the rest are not. Since threadIdx.x is not unique from one block to the next, it is not a proper globally unique thread ID, and therefore each block is writing to the same output locations, rather than each block taking care of a separate part of the arrays.

So what would a possible (proper) general 3D-to-1D index conversion look like?

That is answered here (and probably in other places as well). That answer actually only converts a 3D grid plus 1D block configuration into a globally unique ID, but it is sufficient for demonstrating what is wrong in this code.
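
For reference, here is a minimal sketch of such a conversion, assuming a fully general 3D grid of 3D blocks. This is not the exact code from the linked answer, and the helper name global_idx_3d is made up for illustration:

// Minimal sketch (assumed helper, not from the original post): compute a
// globally unique 1D index for a generic 3D grid of 3D blocks.
__device__ long global_idx_3d()
{
    // Linear ID of this block within the grid
    long blockId = blockIdx.x
                 + blockIdx.y * gridDim.x
                 + blockIdx.z * gridDim.x * gridDim.y;
    // Threads per block, and this thread's linear ID within its block
    long threadsPerBlock = (long)blockDim.x * blockDim.y * blockDim.z;
    long threadInBlock = threadIdx.z * blockDim.x * blockDim.y
                       + threadIdx.y * blockDim.x
                       + threadIdx.x;
    return blockId * threadsPerBlock + threadInBlock;
}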


When I replace your in-kernel idx computation with that code, your kernel populates all of the array entries with 1.0 according to my testing.
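
As a concrete sketch of what that replacement could look like for the launch configuration actually used above (a 1D grid of 1D blocks, launched as kernel<<<blocks,threads>>> with scalar dimensions), the index line inside the kernel might simply become:

// Sketch only: globally unique index for a 1D grid of 1D blocks
long idx = (long)blockIdx.x * blockDim.x + threadIdx.x;

Each block then writes its own contiguous blockDim.x-element slice of the arrays, instead of every block overwriting the first slice.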
