Passing by reference in CUDA through device functions


I want to pass a function by reference in CUDA, inside a MATLAB MEX file. I call add_func, which returns multiple values through pointers, but something goes wrong when the code runs. Please take a look at my code and give me some advice.

#include <stdio.h>
#include "mex.h"
#include "matrix.h"
#include "gpu/mxGPUArray.h"

typedef void(*op_func_t) (double, double, double*, double*);
typedef void(*my_func_t) (double, double, double, double, void (*func)(double, double, double *, double *));

__device__ void add_func(double x, double y, double *z, double *k)

{
    *z= x + y;
    *k= x + y;
}
__device__ void mul_func(double x, double y, double *z, double *k)

{   
    *z= x * y;
    *k= x * y;

}
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
    (*func)(x, y, &z, &k);
}


// Static pointers to device functions

__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;


__global__ void kernel(double const * const x, double const * const y, double * const u, double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)

{
    int const i = blockDim.x * blockIdx.x + threadIdx.x;

        if (i<5)
        {
            (*op2)(x[i], y[i], u[i], v[i], op1);
        }
        else   
        {
            v[i]=10;
            u[i]=8;
        }
    __syncthreads(); // wait for each thread to copy its element
}

//host code

void mexFunction(int nlhs,mxArray *plhs[], int nrhs, const mxArray *prhs[]) {

    /* Declare all variables.*/
    mxGPUArray const *A;
    mxGPUArray const *C;
    mxGPUArray *B;
    mxGPUArray *D;
    double const *d_A;
    double const *d_C;
    double *d_B;
    double *d_D;
    int N;

    /* Choose a reasonably sized number of threads for the block. */
    int const threadsPerBlock = 256;
    int blocksPerGrid;

    /* Initialize the MathWorks GPU API. */
    mxInitGPU();

    A = mxGPUCreateFromMxArray(prhs[0]);
    C = mxGPUCreateFromMxArray(prhs[1]);

    /*
     * Now that we have verified the data type, extract a pointer to the input
     * data on the device.
     */
    d_A = (double const *)(mxGPUGetDataReadOnly(A));
    d_C = (double const *)(mxGPUGetDataReadOnly(C));

    /* Create a GPUArray to hold the result and get its underlying pointer. */
    B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
                            mxGPUGetDimensions(A),
                            mxGPUGetClassID(A),
                            mxGPUGetComplexity(A),
                            MX_GPU_DO_NOT_INITIALIZE);
    D = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
                            mxGPUGetDimensions(A),
                            mxGPUGetClassID(A),
                            mxGPUGetComplexity(A),
                            MX_GPU_DO_NOT_INITIALIZE);
    d_B = (double *)(mxGPUGetData(B));
    d_D = (double *)(mxGPUGetData(D));

    /*
     * Call the kernel using the CUDA runtime API. We are using a 1-D grid
     * here, and it would be possible for the number of elements to be too
     * large for the grid. For this example we are not guarding against this
     * possibility (a grid-stride alternative is sketched after this function).
     */
    N = (int)(mxGPUGetNumberOfElements(A));
    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    op_func_t h_add_func;
    op_func_t h_mul_func;
    my_func_t h_my_func;

    // Copy device function pointer to host side
    cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
    cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
    cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));

    op_func_t d_myfunc = h_mul_func;
    op_func_t d_myfunc1 = h_add_func;
    my_func_t d_myfunc2 = h_my_func;

    kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D, d_B, N, d_myfunc, d_myfunc1, d_myfunc2);

    /* Wrap the result up as a MATLAB gpuArray for return. */
    plhs[0] = mxGPUCreateMxArrayOnGPU(B);
    plhs[1] = mxGPUCreateMxArrayOnGPU(D);
    /*
     * The mxGPUArray pointers are host-side structures that refer to device
     * data. These must be destroyed before leaving the MEX function.
     */
    mxGPUDestroyGPUArray(A);
    mxGPUDestroyGPUArray(B);
    mxGPUDestroyGPUArray(C);
    mxGPUDestroyGPUArray(D);

    return;

}
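As an aside on the comment just before the kernel launch: a 1-D grid can be too small to cover very large N with one thread per element. A common guard is a grid-stride loop, in which each thread processes several elements. A minimal sketch, not part of the original code (kernel_gridstride is an illustrative name):

// Grid-stride sketch: each thread walks the index space in steps of the
// total grid size, so any launch configuration covers any N.
__global__ void kernel_gridstride(double const *x, double const *y,
                                  double *u, double *v, int const N)
{
    for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < N;
         i += blockDim.x * gridDim.x)
    {
        u[i] = x[i] + y[i];
        v[i] = x[i] + y[i];
    }
}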

Some initialization code:

// Static pointers to device functions

__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;

op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;

// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));

op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
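For reference, this symbol-copy step is the standard way to get a device function pointer onto the host, since a __device__ variable cannot be read directly from host code. Below is a minimal self-contained sketch of the same pattern with return-code checking added; the checking is an illustrative addition, not part of the original code:

#include <cstdio>

typedef void (*op_func_t)(double, double, double *, double *);

__device__ void add_func(double x, double y, double *z, double *k)
{
    *z = x + y;
    *k = x + y;
}

// Device-side variable holding the address of add_func.
__device__ op_func_t p_add_func = add_func;

int main()
{
    op_func_t h_add_func;
    // Read the device symbol into a host variable and check for errors.
    cudaError_t err = cudaMemcpyFromSymbol(&h_add_func, p_add_func,
                                           sizeof(op_func_t));
    if (err != cudaSuccess) {
        printf("cudaMemcpyFromSymbol failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // h_add_func can now be passed to a kernel as an argument.
    return 0;
}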

There are a number of problems here. This:

__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
    (*func)(x, y, &z, &k);
}
is broken. The modified values of z and k will never be returned to the caller, because z and k are passed by value. You can pass by reference instead: CUDA is based on C++, and references are supported in __device__ functions. The second problem is that the kernel performs no bounds check, which leads to out-of-bounds memory accesses whenever N is not an exact multiple of threadsPerBlock.
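A reference-based version of my_func looks like this (the same change also applies to the op_func_t typedef and to add_func and mul_func, as in the full fixed code below):

__device__ void my_func(double x, double y, double &z, double &k,
                        void (*func)(double, double, double &, double &))
{
    (*func)(x, y, z, k);
}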

When I fix those two problems in your code, like this:

#include <thrust/device_vector.h>
#include <iostream>

typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));

__device__ void add_func(double x, double y, double& z, double& k)

{
    z= x + y;
    k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)

{   
    z= x * y;
    k= x * y;

}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
    (*func)(x, y, z, k);
}


// Static pointers to device functions

__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;

__global__ void kernel(double const * const x, double const * const y, double * const u, double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
    int const i = blockDim.x * blockIdx.x + threadIdx.x;

        if (i<5) {
            (*op2)(x[i], y[i], u[i], v[i], op1);
        } else if (i<N)   {
            v[i]=10;
            u[i]=8;
        }
}

//host code

int main()
{
    const size_t n = 5;
    const size_t N = n * n;

    /* Declare all variables.*/
    thrust::device_vector<double> A(N, 1.0);
    thrust::device_vector<double> C(N, 1.0);
    thrust::device_vector<double> B(N);
    thrust::device_vector<double> D(N);
    double *d_A = thrust::raw_pointer_cast(A.data()); 
    double *d_C = thrust::raw_pointer_cast(C.data());  
    double *d_B = thrust::raw_pointer_cast(B.data()); 
    double *d_D = thrust::raw_pointer_cast(D.data()); 


    /* Choose a reasonably sized number of threads for the block. */
    int const threadsPerBlock = 256;
    int blocksPerGrid;

    blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    op_func_t h_add_func;
    op_func_t h_mul_func;
    my_func_t h_my_func;

    // Copy device function pointer to host side
    cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
    cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
    cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));

    op_func_t d_myfunc = h_mul_func;
    op_func_t d_myfunc1 = h_add_func;
    my_func_t d_myfunc2 = h_my_func; 

    kernel<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D, d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
    cudaDeviceSynchronize();

    for(const auto bval: B)
        std::cout << bval << std::endl;

    for(const auto dval: D)
        std::cout << dval << std::endl;

    return 0;
}

From the comments:

"What is the problem? It isn't clear what result you expect. What are op_func_t and my_func_t, and where do you initialize all the kernel arguments?"

"Thanks. I do the initialization in the host code, but all the pointers are handled in the device code."

"Can you provide an example of the input data?"
It compiles and runs correctly: the first five elements of B and D are 1 + 1 = 2 (computed through my_func with add_func), and the remaining elements are the constants 10 and 8 written by the else branch.

$ nvcc -o mexhead -std=c++11 -arch=sm_52 mexhead.cu
$ cuda-memcheck ./mexhead 
========= CUDA-MEMCHECK
2
2
2
2
2
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
2
2
2
2
2
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
========= ERROR SUMMARY: 0 errors
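As a closing design note: when the set of operations is known at compile time, passing a functor as a template parameter avoids the cudaMemcpyFromSymbol step entirely and lets the compiler inline the call. A minimal sketch under that assumption (AddOp and kernel_t are illustrative names, not from the code above):

// Illustrative functor: same role as add_func, but inlinable by the compiler.
struct AddOp {
    __device__ void operator()(double x, double y, double &z, double &k) const
    {
        z = x + y;
        k = x + y;
    }
};

// The operation arrives as a template parameter instead of a function pointer.
template <typename Op>
__global__ void kernel_t(double const *x, double const *y,
                         double *u, double *v, int const N, Op op)
{
    int const i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N)
        op(x[i], y[i], u[i], v[i]);
}

// Launch example: kernel_t<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D, d_B, N, AddOp{});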