C++ CUDA中设备类的类型限定符_C++_Scope_Cuda_Device_Qualifiers

C++ CUDA中设备类的类型限定符

c++ cuda

C++ CUDA中设备类的类型限定符,c++,scope,cuda,device,qualifiers,C++,Scope,Cuda,Device,Qualifiers,我目前正试图用一个类制作一段CUDA代码，该类将仅用于设备端（即主机不需要知道它的存在）。但是，我无法为类计算出正确的限定符（deviceclass如下）： \uuuuu设备\uuuuuu浮点数设备函数（float*x）{返回x[0]+x[1]；} 类设备类{ 私人：浮球；公众： deviceclass（float*x）{u a=devicefunction（x）；} float getvalue（）{return\u a；} }; //设备代码 __全局无效向量初始化（浮点*A，整

我目前正试图编写一段使用类的 CUDA 代码，该类仅在设备端使用（即主机不需要知道它的存在）。但是，我无法弄清楚这个类应该使用的正确限定符（

deviceclass

如下）：

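__device__ float devicefunction(float *x) {return x[0]+x[1];}
class deviceclass {
private:
float _a;
public:
deviceclass(float *x) {_a = devicefunction(x);}
float getvalue() {return _a;}
};
// Device code
__global__ void VecInit(float* A, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {
// （此处的部分原始代码在页面提取时丢失；残留内容显示这里创建了对象并调用了 getvalue()）
}
}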
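// Standard CUDA guff below: Variables
float *h_A, *d_A;
// Host code
int main(int argc, char** argv)
{
printf("Vector initialization...\n");
int N = 10000;
size_t size = N * sizeof(float);
// Allocate
h_A = (float*)malloc(size);
cudaMalloc(&d_A, size);
printf("Computing...\n");
// Invoke kernel
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);
// Copy result from device memory to host memory
cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
// ...etc
}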

将

deviceclass

单独设置为

__device__

会在从全局函数调用时引发错误，但是将其设置为

__device__ __host__

或

__global__

似乎没有必要。有人能给我指出正确的方向吗？

我认为

Node（）

是一个打字错误

来自CUDA C编程指南，第3.1.5节：

但是，对于设备代码

，仅支持C++的子集。和附录D.6：

为计算能力 2.x 及更高的设备编译的代码可以使用 C++ 类…

我认为你的代码使用不兼容的C++。

结果证明，限定符必须加在类的成员函数上，下面是一个可以完整运行的版本：

#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

void Cleanup(void);


// Functions to be pointed to
/**
 * Adds two single-precision values.
 *
 * Marked __host__ __device__ so the same helper can be called from device
 * code and from host code (for example in a CPU-side check).
 *
 * @param a first addend
 * @param b second addend
 * @return a + b
 */
__host__ __device__ float Plus(float a, float b)
{
    return a + b;
}

/**
 * Holds the sum of two floats, computed once at construction via Plus().
 *
 * CUDA execution-space qualifiers are applied to the member functions, not
 * to the class itself: both members are marked __device__ so the class can
 * be constructed and queried from kernel code.
 */
class deviceclass {

    private:
        float test;  // result of Plus(a, b), fixed at construction

    public:
        /**
         * @param a first operand passed to Plus()
         * @param b second operand passed to Plus()
         */
        __device__ deviceclass(float a, float b) : test(Plus(a, b)) {}

        /** @return the value computed by the constructor. */
        __device__ float getvalue() const { return test; }
};

// Device code
/**
 * Writes the value produced by deviceclass(1, 2) into the first N elements
 * of A.
 *
 * Indexing uses only the x dimension of the block and grid. Each thread
 * writes the element at its global index; threads whose index is >= N do
 * nothing, so the grid may cover more than N threads.
 *
 * @param A pointer the kernel writes through; must be valid for N floats
 * @param N number of elements to write
 */
__global__ void VecInit(float* A, int N)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N) {
        return;  // tail threads beyond the end of the array
    }

    // The constructor takes float, so pass float literals rather than
    // double constants.
    deviceclass value(1.0f, 2.0f);
    A[i] = value.getvalue();
}

// Standard CUDA guff below: Variables
// h_A: host buffer, allocated with malloc() in main and released in Cleanup.
// d_A: device buffer, allocated with cudaMalloc() in main and released in Cleanup.
// Both have static storage duration, so they start out null; the null checks
// in Cleanup rely on that.
float *h_A, *d_A;

// Host code
// Reports a failed CUDA call on stderr, releases both global buffers, and
// returns the exit code main should propagate.
static int failWith(const char* what, cudaError_t err)
{
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    cudaFree(d_A);  // harmless when d_A is still null
    free(h_A);
    d_A = NULL;
    h_A = NULL;
    return EXIT_FAILURE;
}

/**
 * Allocates a host and a device buffer of N floats, runs VecInit over the
 * device buffer, copies the result back and prints every element.
 *
 * Every allocation, the kernel launch and the copy are checked; on failure
 * a message is written to stderr and the process returns EXIT_FAILURE.
 * On success control passes to Cleanup(), which exits the process.
 *
 * @param argc unused
 * @param argv unused
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    printf("Vector initialization...\n");
    const int N = 10000;
    const size_t size = N * sizeof(float);

    // Allocate
    h_A = (float*)malloc(size);
    if (h_A == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    cudaError_t err = cudaMalloc(&d_A, size);
    if (err != cudaSuccess) {
        return failWith("cudaMalloc", err);
    }

    printf("Computing...\n");
    // Invoke kernel
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    // A launch does not return a status; launch errors must be fetched explicitly.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        return failWith("VecInit launch", err);
    }

    // Copy result from device memory to host memory.
    // Errors raised while the kernel ran are reported by this call.
    err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        return failWith("cudaMemcpy", err);
    }

    // Verify result: '\n' instead of endl avoids flushing once per element;
    // the bytes written are the same.
    for (int i = 0; i < N; ++i) {
        cout << '\n' << h_A[i];
    }

    cout << endl;

    Cleanup();
}

/**
 * Releases the global device and host buffers, resets the device and
 * terminates the process with exit status 0. This function does not return.
 */
void Cleanup(void)
{
    // Free device memory
    if (d_A) {
        cudaFree(d_A);
        d_A = NULL;
    }

    // Free host memory
    if (h_A) {
        free(h_A);
        h_A = NULL;
    }

    // cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement
    // with identical functionality.
    cudaDeviceReset();

    exit(0);
}

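#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
void Cleanup(void);
// Functions to be pointed to
__device__ float Plus(float a, float b) {return a+b;}
class deviceclass {
private:
float test;
public:
__device__ deviceclass(float a, float b) {
test = Plus(a,b);
}
__device__ float getvalue() {return test;}
};
// Device code
__global__ void VecInit(float* A, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {
// （代码的其余部分在页面提取时被截断）

`Node()` 是一个打字错误，它是我实际使用的类的名字（一定是不小心混进来的！）。不过我没有使用任何虚成员函数——那是 CUDA 中类的主要限制——其余的 C++ 在 CUDA 中都是有效的。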
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

using namespace std;

void Cleanup(void);


// Functions to be pointed to
/**
 * Adds two single-precision values.
 *
 * Marked __host__ __device__ so the same helper can be called from device
 * code and from host code (for example in a CPU-side check).
 *
 * @param a first addend
 * @param b second addend
 * @return a + b
 */
__host__ __device__ float Plus(float a, float b)
{
    return a + b;
}

/**
 * Holds the sum of two floats, computed once at construction via Plus().
 *
 * CUDA execution-space qualifiers are applied to the member functions, not
 * to the class itself: both members are marked __device__ so the class can
 * be constructed and queried from kernel code.
 */
class deviceclass {

    private:
        float test;  // result of Plus(a, b), fixed at construction

    public:
        /**
         * @param a first operand passed to Plus()
         * @param b second operand passed to Plus()
         */
        __device__ deviceclass(float a, float b) : test(Plus(a, b)) {}

        /** @return the value computed by the constructor. */
        __device__ float getvalue() const { return test; }
};

// Device code
/**
 * Writes the value produced by deviceclass(1, 2) into the first N elements
 * of A.
 *
 * Indexing uses only the x dimension of the block and grid. Each thread
 * writes the element at its global index; threads whose index is >= N do
 * nothing, so the grid may cover more than N threads.
 *
 * @param A pointer the kernel writes through; must be valid for N floats
 * @param N number of elements to write
 */
__global__ void VecInit(float* A, int N)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N) {
        return;  // tail threads beyond the end of the array
    }

    // The constructor takes float, so pass float literals rather than
    // double constants.
    deviceclass value(1.0f, 2.0f);
    A[i] = value.getvalue();
}

// Standard CUDA guff below: Variables
// h_A: host buffer, allocated with malloc() in main and released in Cleanup.
// d_A: device buffer, allocated with cudaMalloc() in main and released in Cleanup.
// Both have static storage duration, so they start out null; the null checks
// in Cleanup rely on that.
float *h_A, *d_A;

// Host code
// Reports a failed CUDA call on stderr, releases both global buffers, and
// returns the exit code main should propagate.
static int failWith(const char* what, cudaError_t err)
{
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    cudaFree(d_A);  // harmless when d_A is still null
    free(h_A);
    d_A = NULL;
    h_A = NULL;
    return EXIT_FAILURE;
}

/**
 * Allocates a host and a device buffer of N floats, runs VecInit over the
 * device buffer, copies the result back and prints every element.
 *
 * Every allocation, the kernel launch and the copy are checked; on failure
 * a message is written to stderr and the process returns EXIT_FAILURE.
 * On success control passes to Cleanup(), which exits the process.
 *
 * @param argc unused
 * @param argv unused
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    printf("Vector initialization...\n");
    const int N = 10000;
    const size_t size = N * sizeof(float);

    // Allocate
    h_A = (float*)malloc(size);
    if (h_A == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    cudaError_t err = cudaMalloc(&d_A, size);
    if (err != cudaSuccess) {
        return failWith("cudaMalloc", err);
    }

    printf("Computing...\n");
    // Invoke kernel
    const int threadsPerBlock = 256;
    const int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
    VecInit<<<blocksPerGrid, threadsPerBlock>>>(d_A, N);

    // A launch does not return a status; launch errors must be fetched explicitly.
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        return failWith("VecInit launch", err);
    }

    // Copy result from device memory to host memory.
    // Errors raised while the kernel ran are reported by this call.
    err = cudaMemcpy(h_A, d_A, size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        return failWith("cudaMemcpy", err);
    }

    // Verify result: '\n' instead of endl avoids flushing once per element;
    // the bytes written are the same.
    for (int i = 0; i < N; ++i) {
        cout << '\n' << h_A[i];
    }

    cout << endl;

    Cleanup();
}

/**
 * Releases the global device and host buffers, resets the device and
 * terminates the process with exit status 0. This function does not return.
 */
void Cleanup(void)
{
    // Free device memory
    if (d_A) {
        cudaFree(d_A);
        d_A = NULL;
    }

    // Free host memory
    if (h_A) {
        free(h_A);
        h_A = NULL;
    }

    // cudaThreadExit() is deprecated; cudaDeviceReset() is its replacement
    // with identical functionality.
    cudaDeviceReset();

    exit(0);
}