Matlab 在mex中使用cublasSgemmBatched时出错

Matlab 在mex中使用cublasSgemmBatched时出错,matlab,cuda,mex,cublas,Matlab,Cuda,Mex,Cublas,我正在尝试在mex文件中使用cublasSgemmBatched,对matlab中的多个矩阵做批量乘法 我的matlab代码非常简单: gpuDevice(1); a = single(rand(400,10,1500,'gpuArray')); b = single(rand(10,12,1500,'gpuArray')); c = MatCuda(a,b) 我得到以下错误: 使用gpuArray/subsref时出错 CUDA执行期间发生意外错误。CUDA错误是: 未知错误 下面是MEX函数代码: v

我正在尝试在mex文件中使用cublasSgemmBatched,对matlab中的多个矩阵做批量乘法

我的matlab代码非常简单:

% Select GPU device 1 for the gpuArray work below.
gpuDevice(1);
% a: 400-by-10-by-1500 random gpuArray, converted to single after creation.
a = single(rand(400,10,1500,'gpuArray'));
% b: 10-by-12-by-1500 random gpuArray, converted to single after creation.
b = single(rand(10,12,1500,'gpuArray'));
% Call MatCuda on the two arrays (presumably the compiled MEX file whose
% source follows); no trailing semicolon, so the result is displayed.
c = MatCuda(a,b)
我得到以下错误:

使用gpuArray/subsref时出错 CUDA执行期间发生意外错误。CUDA错误是: 未知错误

下面是MEX函数代码:

void mexFunction( int nlhs, mxArray *plhs[],
              int nrhs, const mxArray *prhs[]){

char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";

/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray const *B;
mxGPUArray *C;

/* Initialize the MathWorks GPU API. */
mxInitGPU();

/* Throw an error if the input is not a GPU array. */
if ((nrhs != 2) || !(mxIsGPUArray(prhs[0])) || !(mxIsGPUArray(prhs[1]))) {
    mexErrMsgIdAndTxt(errId, errMsg);
}

A = mxGPUCreateFromMxArray(prhs[0]);
B = mxGPUCreateFromMxArray(prhs[1]);

if ((mxGPUGetClassID(A) != mxSINGLE_CLASS) || (mxGPUGetClassID(B) != mxSINGLE_CLASS)) {
    mexErrMsgIdAndTxt(errId, errMsg);
}

float const *d_A;
float const *d_B;
d_A = (float const *)(mxGPUGetDataReadOnly(A));
d_B = (float const *)(mxGPUGetDataReadOnly(B));

const mwSize *dimsA = mxGPUGetDimensions(A);
size_t nrowsA = dimsA[0];
size_t ncolsA = dimsA[1];
size_t nMatricesA = dimsA[2];
mxFree((void*) dimsA);

const mwSize *dimsB = mxGPUGetDimensions(B);
size_t nrowsB = dimsB[0];
size_t ncolsB = dimsB[1];
size_t nMatricesB = dimsB[2];
mxFree((void*)dimsB);

size_t nrowsC = nrowsA;
size_t ncolsC = ncolsB;

mwSize dimsC[3] = { nrowsA, ncolsB, nMatricesB };
C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
    dimsC,
    mxGPUGetClassID(A),
    mxGPUGetComplexity(A),
    MX_GPU_DO_NOT_INITIALIZE);

float *d_C;
d_C = (float *)(mxGPUGetData(C));

cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
    printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
    exit(EXIT_FAILURE);
}
const float alpha = 1.0f;
const float beta = 0.0f;
ret = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, nrowsA, ncolsB, ncolsA, &alpha, &d_A, nrowsA, &d_B, nrowsB, &beta, &d_C, nrowsC, nMatricesA);

if (ret != CUBLAS_STATUS_SUCCESS)
{
    printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
    exit(EXIT_FAILURE);
}

ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
    printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
    exit(EXIT_FAILURE);
}

plhs[0] = mxGPUCreateMxArrayOnGPU(C);
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
}
我怀疑问题出在函数cublasSgemmBatched上,因为当我从代码中删除这个调用后,就不再出现错误

非常感谢您的帮助!
谢谢

这里不需要MEX文件,您可以使用 pagefun。另外,我建议直接以 single 类型构建 a 和 b,而不是先生成再做类型转换。换句话说,

% Create both operands directly as single-precision gpuArrays.
a = rand(400,10,1500,'single','gpuArray');
b = rand(10,12,1500,'single','gpuArray');
% Apply mtimes to a and b through pagefun.
c = pagefun(@mtimes, a, b);

cublasDgemm 对我来说可以正常工作。我只是把普通数组传给 mexFunction。下面是我的示例代码

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <time.h>
#include "mex.h"
#include "mxGPUArray.h"

/*
 * Extents of the two GEMM operands and of the result.
 * In the mexFunction of this example uiWA/uiHA receive the first/second
 * dimension of A and uiWB/uiHB the first/second dimension of B.
 * uiWC/uiHC are not assigned there -- presumably the extents of C.
 */
typedef struct _matrixSize
{
    unsigned int uiWA;
    unsigned int uiHA;
    unsigned int uiWB;
    unsigned int uiHB;
    unsigned int uiWC;
    unsigned int uiHC;
} sMatrixSize;

void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size);

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
    mxGPUArray const *A;
    mxGPUArray const *B;
    mxGPUArray *C;
    _matrixSize matrix_size;
    mwSize const *A_sz;
    mwSize const *B_sz;

    double const *d_A;
    double const *d_B;
    double *d_C;

    char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
    char const * const errMsg = "Invalid input to MEX file.";

    if (nrhs != 2) {
        mexErrMsgTxt("Need two inputs");
    }

    A = mxGPUCreateFromMxArray(prhs[0]);
    B = mxGPUCreateFromMxArray(prhs[1]);

    A_sz=mxGPUGetDimensions(A);
    B_sz = mxGPUGetDimensions(B);

    matrix_size.uiWA = (unsigned int)A_sz[0]; matrix_size.uiHA = (unsigned int)A_sz[1];
    matrix_size.uiWB = (unsigned int)B_sz[0]; matrix_size.uiHB = (unsigned int)B_sz[1];
    mwSize C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 };

    d_A = (double const *)(mxGPUGetDataReadOnly(A));
    d_B = (double const *)(mxGPUGetDataReadOnly(B));

    C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
        C_sz,
        mxGPUGetClassID(A),
        mxGPUGetComplexity(A),
        MX_GPU_DO_NOT_INITIALIZE);

    d_C = (double *)(mxGPUGetData(C));

    matrixMultiply(d_A, d_B, d_C, matrix_size);
    plhs[0] = mxGPUCreateMxArrayOnGPU(C);

    mxFree((void*)A_sz);
    mxFree((void*)B_sz);
    mxGPUDestroyGPUArray(A);
    mxGPUDestroyGPUArray(B);
    mxGPUDestroyGPUArray(C);
}

void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size)
{
    cublasStatus_t status;
    cublasHandle_t handle;
    status=cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS)
    {
        if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
            mexPrintf("CUBLAS initializing error");
        }
        getchar();
        return;
    }
    const double alpha = 1.0f;
    const double beta = 0.0f;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
    cudaThreadSynchronize();
    cublasDestroy(handle);
}
#包括
#包括
#包括
#包括“mex.h”
#包括“mxGPUArray.h”
类型定义结构矩阵大小
{
未签署的国际uiWA、uiHA、uiWB、uiHB、uiWC、uiHC;
}sMatrixSize;
无效矩阵倍数(双常数*d_A、双常数*d_B、双*d_C、sMatrixSize和矩阵大小);
无效MEX函数(整数nlhs、mxArray*plhs[]、整数nrhs、mxArray const*prhs[])
{
mxGPUArray常数*A;
mxGPUArray常数*B;
mxGPUArray*C;
_矩阵大小矩阵大小;
mwSize const*A_sz;
mwSize const*B_sz;
双常数*d_A;
双常数*d_B;
双*d_C;
char const*const errId=“parallel:gpu:mexGPUExample:InvalidInput”;
char const*const errMsg=“MEX文件的输入无效。”;
如果(nrhs!=2){
mexErrMsgTxt(“需要两个输入”);
}
A=mxGPUCreateFromMxArray(prhs[0]);
B=mxGPUCreateFromMxArray(prhs[1]);
A_sz=mxGPUGetDimensions(A);
B_sz=mxGPUGetDimensions(B);
矩阵_size.uiWA=(无符号整数)A_sz[0];矩阵_size.uiHA=(无符号整数)A_sz[1];
矩阵_size.uiWB=(无符号整数)B_sz[0];矩阵_size.uiHB=(无符号整数)B_sz[1];
mwSize C_sz[3]={matrix_size.uiWA,matrix_size.uiHB,1};
d_A=(双常数*)(mxGPUGetDataReadOnly(A));
d_B=(双常数*)(mxGPUGetDataReadOnly(B));
C=mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
司徒,
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_不_初始化);
d_C=(双*)(mxGPUGetData(C));
矩阵倍数(d_A,d_B,d_C,矩阵大小);
plhs[0]=mxGPUCreateMxArrayOnGPU(C);
mxFree((void*)A_sz);
mxFree((void*)B_sz);
mxgpudestroygpu阵列(A);
mxgpudestroygpu阵列(B);
mxgpudestroygpu阵列(C);
}
无效矩阵倍数(双常数*d_A、双常数*d_B、双*d_C、sMatrixSize和矩阵大小)
{
立方体状态;
立方手柄;
状态=创建(&handle);
if(状态!=CUBLAS\u状态\u成功)
{
如果(状态==CUBLAS\u状态\u未初始化){
mexPrintf(“CUBLAS初始化错误”);
}
getchar();
返回;
}
常数双α=1.0f;
常数双贝塔=0.0f;
cublasDgemm(手柄、CUBLAS_OP_N、CUBLAS_OP_N、matrix_size.uiWB、matrix_size.uiHA、matrix_size.uiWA和alpha、d_B、matrix_size.uiWB、d_A、matrix_size.uiWA和beta、d_C、matrix_size.uiWB);
cudaThreadSynchronize();
立方体(把手);
}
还有一种更原生的 CUDA 写法。这两种方法对我都有效

#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <time.h>
#include "mex.h"
#include "mxGPUArray.h"

/*
 * Extents of the two GEMM operands and of the result.
 * The mexFunction of this example fills uiWA/uiHA from mxGetM/mxGetN of A,
 * uiWB/uiHB from mxGetM/mxGetN of B, and uiWC/uiHC from mxGetM of A and
 * mxGetN of B.
 */
typedef struct _matrixSize
{
    unsigned int uiWA;
    unsigned int uiHA;
    unsigned int uiWB;
    unsigned int uiHB;
    unsigned int uiWC;
    unsigned int uiHC;
} sMatrixSize;

void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size);

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
    mxArray const *mA;
    mxArray const *mB;

    _matrixSize matrix_size;
    size_t A_w, A_h, B_w, B_h;

    double *d_A;
    double *d_B;
    double *d_C;

    double *h_A;
    double *h_B;
    double *h_C;

    char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
    char const * const errMsg = "Invalid input to MEX file.";

    if (nrhs != 2) {
        mexErrMsgTxt("Need two inputs");
    }

    mA = prhs[0]; mB = prhs[1];
    A_w = mxGetM(mA);A_h = mxGetN(mA);B_w = mxGetM(mB);B_h = mxGetN(mB);

    matrix_size.uiWA = (unsigned int)A_w; matrix_size.uiHA = (unsigned int)A_h;
    matrix_size.uiWB = (unsigned int)B_w; matrix_size.uiHB = (unsigned int)B_h;
    matrix_size.uiWC = (unsigned int)A_w; matrix_size.uiHC = (unsigned int)B_h;

    mwSize const C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 };

    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
    unsigned int mem_size_A = sizeof(double) * size_A;
    h_A = (double*)mxGetData(mA);

    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
    unsigned int mem_size_B = sizeof(double) * size_B;
    h_B = (double*)mxGetData(mB);

    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
    unsigned int mem_size_C = sizeof(double) * size_C;

    plhs[0] = mxCreateNumericArray(3, C_sz, mxDOUBLE_CLASS, mxREAL);
    h_C = (double*)mxGetData(plhs[0]);

    cudaMalloc((void **)&d_A, mem_size_A);
    cudaMalloc((void **)&d_B, mem_size_B);
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&d_C, mem_size_C);

    matrixMultiply(d_A, d_B, d_C, matrix_size);
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);      
}


void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size)
{
    cublasHandle_t handle;
    cublasCreate(&handle);
    const double alpha = 1.0f;
    const double beta = 0.0f;
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
    cublasDestroy(handle);
}
#包括
#包括
#包括
#包括“mex.h”
#包括“mxGPUArray.h”
类型定义结构矩阵大小
{
未签署的国际uiWA、uiHA、uiWB、uiHB、uiWC、uiHC;
}sMatrixSize;
无效矩阵倍数(双常数*d_A、双常数*d_B、双*d_C、sMatrixSize和矩阵大小);
无效MEX函数(整数nlhs、mxArray*plhs[]、整数nrhs、mxArray const*prhs[])
{
mx阵列常数*mA;
MX阵列常数*mB;
_矩阵大小矩阵大小;
尺寸A_w,A_h,B_w,B_h;
双d_A;
双*d_B;
双*d_C;
双*h_A;
双*h_B;
双*h_C;
char const*const errId=“parallel:gpu:mexGPUExample:InvalidInput”;
char const*const errMsg=“MEX文件的输入无效。”;
如果(nrhs!=2){
mexErrMsgTxt(“需要两个输入”);
}
mA=prhs[0];mB=prhs[1];
A_w=mxGetM(mA);A_h=mxGetN(mA);B_w=mxGetM(mB);B_h=mxGetN(mB);
矩阵_size.uiWA=(无符号整数)A_w;矩阵_size.uiHA=(无符号整数)A_h;
矩阵_size.uiWB=(无符号整数)B_w;矩阵_size.uiHB=(无符号整数)B_h;
矩阵_size.uiWC=(无符号整数)A_w;矩阵_size.uiHC=(无符号整数)B_h;
mwSize const C_sz[3]={matrix_size.uiWA,matrix_size.uiHB,1};
无符号整数大小=矩阵大小.uiWA*矩阵大小.uiHA;
无符号整数mem_size_A=sizeof(double)*size_A;
h_A=(双*)mxGetData(mA);
无符号整数大小=矩阵大小.uiWB*矩阵大小.uiHB;
无符号整数mem_size_B=sizeof(double)*size_B;
h_B=(双*)mxGetData(mB);
无符号整数大小=矩阵大小.uiWC*矩阵大小.uiHC;
无符号整数mem_size_C=sizeof(double)*size_C;
plhs[0]=mxCreateNumericArray(3,C_sz,mxDOUBLE_类,mxREAL);
h_C=(双*)mxGetData(plhs[0]);
Cudamaloc((空白**)和d_A、mem_大小_A);
Cudamaloc((无效**)和d_B、mem_尺寸_B);
cudaMemcpy(d_A、h_A、mem_size_A、cudamemcpyhostodevice);
cudaMemcpy(d_B,h_B,mem_size_B,cudamemcpyhostodevice);
Cudamaloc((void**)和d_C、mem_size_C);
矩阵倍数(d_A,d_B,d_C,矩阵大小);
cudaMemcpy(h_C、d_C、mem_size_C、cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
库达弗里(杜阿);
库达弗里(杜布);
库达弗里(d_C);
}
无效矩阵倍数(双常数*d_A、双常数*d_B、双*d_C、sMatrixSize和矩阵大小)
{
立方手柄;
cublasCreate(&handle);
常数双α=1.0f;
常数双贝塔=0.0f;
cublasDgemm(手柄、CUBLAS_OP_N、CUBLAS_OP_N、matrix_size.uiWB、matrix_size.uiHA、matrix_size.uiWA和alpha、d_B、matrix_size.uiWB、d_A、matrix_size.uiWA和beta、d_C、matrix_size.uiWB);
立方体(把手);
}

cublasSgemmBatched 比大多数 cuBLAS 函数的用法更复杂:它的 A、B、C 参数必须是存放在设备内存中的指针数组,每个元素指向一个矩阵。