通过设备功能CUDA中的引用传递
我想在CUDA中通过引用传递函数,并在mex文件Matlab中传递函数,我调用了add_func,它返回多个变量,我通过指针传递,但在执行代码时出现了问题。请看一下我的代码,给我一些预付款通过设备功能CUDA中的引用传递,cuda,parameter-passing,Cuda,Parameter Passing,我想在CUDA中通过引用传递函数,并在mex文件Matlab中传递函数,我调用了add_func,它返回多个变量,我通过指针传递,但在执行代码时出现了问题。请看一下我的代码,给我一些预付款 #include <stdio.h> #include "mex.h" #include "matrix.h" #include "gpu/mxGPUArray.h" typedef void(*op_func_t) (double, double, double*, double*); typ
#include <stdio.h>
#include "mex.h"
#include "matrix.h"
#include "gpu/mxGPUArray.h"
typedef void(*op_func_t) (double, double, double*, double*);
typedef void(*my_func_t) (double, double, double, double, void (*func)(double, double, double *, double *));
__device__ void add_func(double x, double y, double *z, double *k)
{
*z= x + y;
*k= x + y;
}
__device__ void mul_func(double x, double y, double *z, double *k)
{
*z= x * y;
*k= x * y;
}
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
(*func)(x, y, &z, &k);
}
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<5)
{
(*op2)(x[i], y[i], u[i], v[i], op1);
}
else
{
v[i]=10;
u[i]=8;
}
__syncthreads();// wait for each thread to copy its elemenet
}
//host code
void mexFunction(int nlhs,mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray const *C;
mxGPUArray *B;
mxGPUArray *D;
double const *d_A;
double const *d_C;
double *d_B;
double *d_D;
int N;
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
A = mxGPUCreateFromMxArray(prhs[0]);
C = mxGPUCreateFromMxArray(prhs[1]);
/*
* Now that we have verified the data type, extract a pointer to the input
* data on the device.
*/
d_A = (double const *)(mxGPUGetDataReadOnly(A));
d_C = (double const *)(mxGPUGetDataReadOnly(C));
/* Create a GPUArray to hold the result and get its underlying pointer. */
B = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
mxGPUGetDimensions(A),
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
D = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
mxGPUGetDimensions(A),
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
d_B = (double *)(mxGPUGetData(B));
d_D = (double *)(mxGPUGetData(D));
/*
* Call the kernel using the CUDA runtime API. We are using a 1-d grid here,
* and it would be possible for the number of elements to be too large for
* the grid. For this example we are not guarding against this possibility.
*/
N = (int)(mxGPUGetNumberOfElements(A));
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
/* Wrap the result up as a MATLAB gpuArray for return. */
plhs[0] = mxGPUCreateMxArrayOnGPU(B);
plhs[1] = mxGPUCreateMxArrayOnGPU(D);
/*
* The mxGPUArray pointers are host-side structures that refer to device
* data. These must be destroyed before leaving the MEX function.
*/
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
mxGPUDestroyGPUArray(D);
return;
}
a=
b=
一些初始化代码:
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
这里有很多问题。这:
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
(*func)(x, y, &z, &k);
}
它坏了。修改后的z和k值将永远不会返回给调用者,因为传递值。您可以通过引用(CUDA是基于C++的,并且在.x Deice函数中支持引用):
第二个问题是内核中缺少边界检查,当N
不是threadsPerBlock
的精确倍数时,这将导致越界内存访问
当我在您的代码中修复这两个问题时,如下所示:
#include <thrust/device_vector.h>
#include <iostream>
typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));
__device__ void add_func(double x, double y, double& z, double& k)
{
z= x + y;
k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)
{
z= x * y;
k= x * y;
}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
(*func)(x, y, z, k);
}
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<5) {
(*op2)(x[i], y[i], u[i], v[i], op1);
} else if (i<N) {
v[i]=10;
u[i]=8;
}
}
//host code
int main()
{
const size_t n = 5;
const size_t N = n * n;
/* Declare all variables.*/
thrust::device_vector<double> A(N, 1.0);
thrust::device_vector<double> C(N, 1.0);
thrust::device_vector<double> B(N);
thrust::device_vector<double> D(N);
double *d_A = thrust::raw_pointer_cast(A.data());
double *d_C = thrust::raw_pointer_cast(C.data());
double *d_B = thrust::raw_pointer_cast(B.data());
double *d_D = thrust::raw_pointer_cast(D.data());
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
cudaDeviceSynchronize();
for(const auto bval: B)
std::cout << bval << std::endl;
for(const auto dval: D)
std::cout << dval << std::endl;
return 0;
}
有什么问题?不清楚您希望得到什么结果。什么是
op_func_t
和my_func_t
?您在哪里初始化所有内核参数?谢谢。我正在初始化主机代码。但处理了设备编码中的所有指针。您能提供一个输入数据的示例吗?
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
__device__ void my_func(double x, double y, double z, double k, void (*func)(double, double, double *, double *))
{
(*func)(x, y, &z, &k);
}
typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));
__device__ void add_func(double x, double y, double& z, double& k)
{
z= x + y;
k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)
{
z= x * y;
k= x * y;
}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
(*func)(x, y, z, k);
}
#include <thrust/device_vector.h>
#include <iostream>
typedef void(*op_func_t) (double, double, double&, double&);
typedef void(*my_func_t) (double, double, double&, double&, void (*func)(double, double, double &, double &));
__device__ void add_func(double x, double y, double& z, double& k)
{
z= x + y;
k= x + y;
}
__device__ void mul_func(double x, double y, double& z, double& k)
{
z= x * y;
k= x * y;
}
__device__ void my_func(double x, double y, double& z, double& k, void (*func)(double, double, double&, double&))
{
(*func)(x, y, z, k);
}
// Static pointers to device functions
__device__ op_func_t p_add_func = add_func;
__device__ op_func_t p_mul_func = mul_func;
__device__ my_func_t p_my_func = my_func;
__global__ void kernel(double const * const x, double const * const y,double * const u,double * const v, int const N, op_func_t op, op_func_t op1, my_func_t op2)
{
int const i = blockDim.x * blockIdx.x + threadIdx.x;
if (i<5) {
(*op2)(x[i], y[i], u[i], v[i], op1);
} else if (i<N) {
v[i]=10;
u[i]=8;
}
}
//host code
int main()
{
const size_t n = 5;
const size_t N = n * n;
/* Declare all variables.*/
thrust::device_vector<double> A(N, 1.0);
thrust::device_vector<double> C(N, 1.0);
thrust::device_vector<double> B(N);
thrust::device_vector<double> D(N);
double *d_A = thrust::raw_pointer_cast(A.data());
double *d_C = thrust::raw_pointer_cast(C.data());
double *d_B = thrust::raw_pointer_cast(B.data());
double *d_D = thrust::raw_pointer_cast(D.data());
/* Choose a reasonably sized number of threads for the block. */
int const threadsPerBlock = 256;
int blocksPerGrid;
blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
op_func_t h_add_func;
op_func_t h_mul_func;
my_func_t h_my_func;
// Copy device function pointer to host side
cudaMemcpyFromSymbol(&h_mul_func, p_mul_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_add_func, p_add_func, sizeof(op_func_t));
cudaMemcpyFromSymbol(&h_my_func, p_my_func, sizeof(my_func_t));
op_func_t d_myfunc = h_mul_func;
op_func_t d_myfunc1 = h_add_func;
my_func_t d_myfunc2 = h_my_func;
kernel <<<blocksPerGrid, threadsPerBlock>>>(d_A, d_C, d_D ,d_B, N, d_myfunc, d_myfunc1, d_myfunc2);
cudaDeviceSynchronize();
for(const auto bval: B)
std::cout << bval << std::endl;
for(const auto dval: D)
std::cout << dval << std::endl;
return 0;
}
$ nvcc -o mexhead -std=c++11 -arch=sm_52 mexhead.cu
$ cuda-memcheck ./mexhead
========= CUDA-MEMCHECK
2
2
2
2
2
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
2
2
2
2
2
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
========= ERROR SUMMARY: 0 errors