在内核内调用cublas函数时编译CUDA代码

在内核内调用cublas函数时编译CUDA代码,cuda,gpu,gpgpu,nvidia,cublas,Cuda,Gpu,Gpgpu,Nvidia,Cublas,我正在尝试运行一个非常简单的内核,它调用cublassgemm函数。我的代码是: __global__ void cor (float * dev_mat,float * dev_cor,int n,cublasHandle_t handle) { const float alpha = 1.0; const float beta = 0; cublasStatus_t stat; stat = cublasSgemm(handle

我正在尝试运行一个非常简单的内核,它调用cublassgemm函数。我的代码是:

__global__ void cor (float * dev_mat,float * dev_cor,int n,cublasHandle_t handle)
{
        const float alpha = 1.0;
        const float beta = 0;
        cublasStatus_t stat;
        stat = cublasSgemm(handle, CUBLAS_OP_N,  CUBLAS_OP_N, n, n, n,  &alpha, dev_mat, n, dev_mat,n,&beta, dev_cor, n);
        if(stat != CUBLAS_STATUS_SUCCESS)
                {
                        cout<<"error in cublas sgemm \n";
                }
}
int main()
{
int  m =1000,n = 1000;
float * h_mat = new float[m*n];
float * h_cor = new float[m*n];
float * dev_mat,*dev_cor;
cudaMalloc(&dev_mat,m*n*sizeof(float));
cudaMalloc(&dev_cor,m*n*sizeof(float));
for (int i = 0; i< m; i++)
        for(int j = 0; j <n;j++)
                {
                        h_mat[i*n+j]=rand()%10;
                }
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasSetMatrix(m, n, sizeof(float), h_mat, m, dev_mat, m);
if(stat !=CUBLAS_STATUS_SUCCESS)
        {
                cout<<"error in cublassetmatrix   \n";
                return stat;
        }
stat = cublasCreate (&handle);
if(stat != CUBLAS_STATUS_SUCCESS)
        {
                cout<<"error in cublas create handle \n";
                return stat;
        }

cor<<<1,1>>>(dev_mat,dev_cor,n,handle);
cudaFree(dev_mat);
delete []h_mat;
delete []h_cor;
return 0;
}
\uuuuuuuuuuuuuuuvoid cor(浮点*偏差垫,浮点*偏差系数,整数n,立方手柄)
{
常数浮点α=1.0;
常数浮点β=0;
库布拉斯塔图斯统计局;
stat=cublasSgemm(句柄、CUBLAS_OP_N、CUBLAS_OP_N、N、N和alpha、dev_mat、N、dev_mat、N和beta、dev_cor、N);
if(stat!=CUBLAS\u STATUS\u SUCCESS)
{
cout有两个问题(至少):

  • 您不能在CUDA内核中使用
    cout
    。将其更改为等效的
    printf
    语句,您应该可以:

    printf("error in cublas sgemm \n");
    
  • 您的compile命令不正确。您提供的链接显示了必要的组件。您应该使用以下内容:

    nvcc -arch=sm_35 -rdc=true -o cublassgemm_inside_kernel cublassgemm_inside_kernel.cu -lcublas -lcublas_device -lcudadevrt
    
  • 当然,这只适用于cc3.5或更高的GPU