MATLAB:在 MEX 文件中使用 cublasSgemmBatched 时出错
标签:matlab, cuda, mex, cublas
我正在尝试在 MEX 文件中调用 cublasSgemmBatched,从 MATLAB 中批量相乘多个矩阵。我的 MATLAB 代码非常简单:
% Driver script: multiply 1500 pairs of matrices page by page with the MatCuda
% MEX function. The operands are created directly as single precision on the
% GPU, which avoids generating double data and casting it afterwards.
gpuDevice(1);                                  % select GPU 1
a = rand(400,10,1500,'single','gpuArray');     % 1500 pages, each 400-by-10
b = rand(10,12,1500,'single','gpuArray');      % 1500 pages, each 10-by-12
c = MatCuda(a,b)                               % c(:,:,i) = a(:,:,i) * b(:,:,i)
我得到以下错误:
使用gpuArray/subsref时出错
CUDA执行期间发生意外错误。CUDA错误是:
未知错误
下面是MEX函数代码:
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[]){
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
/* Declare all variables.*/
mxGPUArray const *A;
mxGPUArray const *B;
mxGPUArray *C;
/* Initialize the MathWorks GPU API. */
mxInitGPU();
/* Throw an error if the input is not a GPU array. */
if ((nrhs != 2) || !(mxIsGPUArray(prhs[0])) || !(mxIsGPUArray(prhs[1]))) {
mexErrMsgIdAndTxt(errId, errMsg);
}
A = mxGPUCreateFromMxArray(prhs[0]);
B = mxGPUCreateFromMxArray(prhs[1]);
if ((mxGPUGetClassID(A) != mxSINGLE_CLASS) || (mxGPUGetClassID(B) != mxSINGLE_CLASS)) {
mexErrMsgIdAndTxt(errId, errMsg);
}
float const *d_A;
float const *d_B;
d_A = (float const *)(mxGPUGetDataReadOnly(A));
d_B = (float const *)(mxGPUGetDataReadOnly(B));
const mwSize *dimsA = mxGPUGetDimensions(A);
size_t nrowsA = dimsA[0];
size_t ncolsA = dimsA[1];
size_t nMatricesA = dimsA[2];
mxFree((void*) dimsA);
const mwSize *dimsB = mxGPUGetDimensions(B);
size_t nrowsB = dimsB[0];
size_t ncolsB = dimsB[1];
size_t nMatricesB = dimsB[2];
mxFree((void*)dimsB);
size_t nrowsC = nrowsA;
size_t ncolsC = ncolsB;
mwSize dimsC[3] = { nrowsA, ncolsB, nMatricesB };
C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
dimsC,
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
float *d_C;
d_C = (float *)(mxGPUGetData(C));
cublasHandle_t handle;
cublasStatus_t ret;
ret = cublasCreate(&handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
const float alpha = 1.0f;
const float beta = 0.0f;
ret = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, nrowsA, ncolsB, ncolsA, &alpha, &d_A, nrowsA, &d_B, nrowsB, &beta, &d_C, nrowsC, nMatricesA);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
ret = cublasDestroy(handle);
if (ret != CUBLAS_STATUS_SUCCESS)
{
printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__);
exit(EXIT_FAILURE);
}
plhs[0] = mxGPUCreateMxArrayOnGPU(C);
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
}
我怀疑问题出在 cublasSgemmBatched 这个函数上,因为把它从代码中删掉之后就不再报错了。
非常感谢您的帮助!
谢谢!
这里不需要 MEX 文件,您可以直接使用 pagefun。另外,我建议直接以 single 类型构造 a 和 b,而不是先生成 double 再做类型转换。换句话说:
% Page-wise matrix product without a MEX file: both operands are created
% directly as single-precision gpuArrays, then pagefun applies mtimes to each
% pair of pages.
a = rand([400 10 1500], 'single', 'gpuArray');
b = rand([10 12 1500], 'single', 'gpuArray');
c = pagefun(@mtimes, a, b);
cublasDgemm 对我是有效的。我只是把普通(非批量的)数组传给 mexFunction。下面是我的示例代码:
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <time.h>
#include "mex.h"
#include "mxGPUArray.h"
/*
 * Extents of the two operands (A, B) and of the result (C) of a matrix product.
 * The MEX gateway in this listing stores the first dimension of each array in
 * the "W" field and the second dimension in the "H" field.
 */
typedef struct _matrixSize
{
    unsigned int uiWA;   /* first dimension of A  */
    unsigned int uiHA;   /* second dimension of A */
    unsigned int uiWB;   /* first dimension of B  */
    unsigned int uiHB;   /* second dimension of B */
    unsigned int uiWC;   /* first dimension of C  */
    unsigned int uiHC;   /* second dimension of C */
} sMatrixSize;
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size);
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
mxGPUArray const *A;
mxGPUArray const *B;
mxGPUArray *C;
_matrixSize matrix_size;
mwSize const *A_sz;
mwSize const *B_sz;
double const *d_A;
double const *d_B;
double *d_C;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
if (nrhs != 2) {
mexErrMsgTxt("Need two inputs");
}
A = mxGPUCreateFromMxArray(prhs[0]);
B = mxGPUCreateFromMxArray(prhs[1]);
A_sz=mxGPUGetDimensions(A);
B_sz = mxGPUGetDimensions(B);
matrix_size.uiWA = (unsigned int)A_sz[0]; matrix_size.uiHA = (unsigned int)A_sz[1];
matrix_size.uiWB = (unsigned int)B_sz[0]; matrix_size.uiHB = (unsigned int)B_sz[1];
mwSize C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 };
d_A = (double const *)(mxGPUGetDataReadOnly(A));
d_B = (double const *)(mxGPUGetDataReadOnly(B));
C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
C_sz,
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
d_C = (double *)(mxGPUGetData(C));
matrixMultiply(d_A, d_B, d_C, matrix_size);
plhs[0] = mxGPUCreateMxArrayOnGPU(C);
mxFree((void*)A_sz);
mxFree((void*)B_sz);
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
}
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size)
{
cublasStatus_t status;
cublasHandle_t handle;
status=cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
mexPrintf("CUBLAS initializing error");
}
getchar();
return;
}
const double alpha = 1.0f;
const double beta = 0.0f;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
cudaThreadSynchronize();
cublasDestroy(handle);
}
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <time.h>
#include "mex.h"
#include "mxGPUArray.h"
typedef struct _matrixSize
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size);
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
mxGPUArray const *A;
mxGPUArray const *B;
mxGPUArray *C;
_matrixSize matrix_size;
mwSize const *A_sz;
mwSize const *B_sz;
double const *d_A;
double const *d_B;
double *d_C;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
if (nrhs != 2) {
mexErrMsgTxt("Need two inputs");
}
A = mxGPUCreateFromMxArray(prhs[0]);
B = mxGPUCreateFromMxArray(prhs[1]);
A_sz=mxGPUGetDimensions(A);
B_sz = mxGPUGetDimensions(B);
matrix_size.uiWA = (unsigned int)A_sz[0]; matrix_size.uiHA = (unsigned int)A_sz[1];
matrix_size.uiWB = (unsigned int)B_sz[0]; matrix_size.uiHB = (unsigned int)B_sz[1];
mwSize C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 };
d_A = (double const *)(mxGPUGetDataReadOnly(A));
d_B = (double const *)(mxGPUGetDataReadOnly(B));
C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A),
C_sz,
mxGPUGetClassID(A),
mxGPUGetComplexity(A),
MX_GPU_DO_NOT_INITIALIZE);
d_C = (double *)(mxGPUGetData(C));
matrixMultiply(d_A, d_B, d_C, matrix_size);
plhs[0] = mxGPUCreateMxArrayOnGPU(C);
mxFree((void*)A_sz);
mxFree((void*)B_sz);
mxGPUDestroyGPUArray(A);
mxGPUDestroyGPUArray(B);
mxGPUDestroyGPUArray(C);
}
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size)
{
cublasStatus_t status;
cublasHandle_t handle;
status=cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
if (status == CUBLAS_STATUS_NOT_INITIALIZED) {
mexPrintf("CUBLAS initializing error");
}
getchar();
return;
}
const double alpha = 1.0f;
const double beta = 0.0f;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
cudaThreadSynchronize();
cublasDestroy(handle);
}
还有一种更原生的 CUDA 写法(手动分配显存并拷贝数据)。这两种方法对我都适用:
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <time.h>
#include "mex.h"
#include "mxGPUArray.h"
/*
 * Extents of the two operands (A, B) and of the result (C) of a matrix product.
 * The MEX gateway in this listing fills the "W" fields from mxGetM and the
 * "H" fields from mxGetN.
 */
typedef struct _matrixSize
{
    unsigned int uiWA;   /* mxGetM of A */
    unsigned int uiHA;   /* mxGetN of A */
    unsigned int uiWB;   /* mxGetM of B */
    unsigned int uiHB;   /* mxGetN of B */
    unsigned int uiWC;   /* first extent of C  */
    unsigned int uiHC;   /* second extent of C */
} sMatrixSize;
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size);
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
mxArray const *mA;
mxArray const *mB;
_matrixSize matrix_size;
size_t A_w, A_h, B_w, B_h;
double *d_A;
double *d_B;
double *d_C;
double *h_A;
double *h_B;
double *h_C;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
if (nrhs != 2) {
mexErrMsgTxt("Need two inputs");
}
mA = prhs[0]; mB = prhs[1];
A_w = mxGetM(mA);A_h = mxGetN(mA);B_w = mxGetM(mB);B_h = mxGetN(mB);
matrix_size.uiWA = (unsigned int)A_w; matrix_size.uiHA = (unsigned int)A_h;
matrix_size.uiWB = (unsigned int)B_w; matrix_size.uiHB = (unsigned int)B_h;
matrix_size.uiWC = (unsigned int)A_w; matrix_size.uiHC = (unsigned int)B_h;
mwSize const C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 };
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(double) * size_A;
h_A = (double*)mxGetData(mA);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(double) * size_B;
h_B = (double*)mxGetData(mB);
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(double) * size_C;
plhs[0] = mxCreateNumericArray(3, C_sz, mxDOUBLE_CLASS, mxREAL);
h_C = (double*)mxGetData(plhs[0]);
cudaMalloc((void **)&d_A, mem_size_A);
cudaMalloc((void **)&d_B, mem_size_B);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_C, mem_size_C);
matrixMultiply(d_A, d_B, d_C, matrix_size);
cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size)
{
cublasHandle_t handle;
cublasCreate(&handle);
const double alpha = 1.0f;
const double beta = 0.0f;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
cublasDestroy(handle);
}
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <time.h>
#include "mex.h"
#include "mxGPUArray.h"
typedef struct _matrixSize
{
unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
} sMatrixSize;
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size);
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[])
{
mxArray const *mA;
mxArray const *mB;
_matrixSize matrix_size;
size_t A_w, A_h, B_w, B_h;
double *d_A;
double *d_B;
double *d_C;
double *h_A;
double *h_B;
double *h_C;
char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput";
char const * const errMsg = "Invalid input to MEX file.";
if (nrhs != 2) {
mexErrMsgTxt("Need two inputs");
}
mA = prhs[0]; mB = prhs[1];
A_w = mxGetM(mA);A_h = mxGetN(mA);B_w = mxGetM(mB);B_h = mxGetN(mB);
matrix_size.uiWA = (unsigned int)A_w; matrix_size.uiHA = (unsigned int)A_h;
matrix_size.uiWB = (unsigned int)B_w; matrix_size.uiHB = (unsigned int)B_h;
matrix_size.uiWC = (unsigned int)A_w; matrix_size.uiHC = (unsigned int)B_h;
mwSize const C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 };
unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
unsigned int mem_size_A = sizeof(double) * size_A;
h_A = (double*)mxGetData(mA);
unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
unsigned int mem_size_B = sizeof(double) * size_B;
h_B = (double*)mxGetData(mB);
unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
unsigned int mem_size_C = sizeof(double) * size_C;
plhs[0] = mxCreateNumericArray(3, C_sz, mxDOUBLE_CLASS, mxREAL);
h_C = (double*)mxGetData(plhs[0]);
cudaMalloc((void **)&d_A, mem_size_A);
cudaMalloc((void **)&d_B, mem_size_B);
cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice);
cudaMalloc((void **)&d_C, mem_size_C);
matrixMultiply(d_A, d_B, d_C, matrix_size);
cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
}
void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size)
{
cublasHandle_t handle;
cublasCreate(&handle);
const double alpha = 1.0f;
const double beta = 0.0f;
cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB);
cublasDestroy(handle);
}
gemmBatched 比大多数 cuBLAS 函数都更复杂。