C++ 无法使cublasSgelsbatched函数工作
我目前正在尝试让cublasSgelsbatched()版本正常工作。我首先做了一个小测试用例,看看到底需要哪些参数,以及如何输入这些参数。然而,经过多次尝试和错误后,我仍然无法让它工作,我得到了13的状态返回,对应于CUBLAS_status_EXECUTION_FAILED,这是一个非常模糊的错误,我还尝试了其他一些CUBLAS测试用例,它们似乎工作正常。我还在MATlab中测试了输入矩阵,它确实有一个LS解C++ 无法使cublasSgelsbatched函数工作,c++,cuda,cublas,C++,Cuda,Cublas,我目前正在尝试让cublasSgelsbatched()版本正常工作。我首先做了一个小测试用例,看看到底需要哪些参数,以及如何输入这些参数。然而,经过多次尝试和错误后,我仍然无法让它工作,我得到了13的状态返回,对应于CUBLAS_status_EXECUTION_FAILED,这是一个非常模糊的错误,我还尝试了其他一些CUBLAS测试用例,它们似乎工作正常。我还在MATlab中测试了输入矩阵,它确实有一个LS解 #include "stdafx.h" #include &q
#include "stdafx.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cmath>
#include <Windows.h>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
&gAmat,
lda, //or 1
&gYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
#包括“stdafx.h”
#包括“设备启动参数.h”
#包括
#包括
#包括
#包括
#包括“cublas_v2.h”
#包括
#包括
#包括
int main()
{
//初始化id、句柄和stat
int id=cudaGetDevice(&id);
库布拉山德勒穆库布拉斯;
库布拉斯塔图斯统计局;
//创建句柄
stat=cublasCreate(&m_cuBLAS);
//params
常数int C=3;
常数int M=2;
长lda=C;
长ldb=M;
//初始变量
浮动*Amat、*Ymat、*Xmat;
浮球*gAmat,*gYmat;
//分配内存
Amat=(浮动*)malloc(M*C*sizeof(浮动));
Ymat=(浮动*)malloc(C*sizeof(浮动));
Xmat=(float*)malloc(M*sizeof(float));
srand(100);
对于(int i=0;i std::cout正如注释中所指出的,您没有在设备上创建正确的指针数组。与设备内存中的指针数组一起工作,用于数据参数,例如:
Aarray设备指向数组的指针的输入/输出数组,每个数组的维数为.m x n,lda>=max(1,m)。矩阵Aarray[i]不应重叠;否则,预期会出现未定义的行为
例如,传递&gAmat
似乎满足类型要求,但该指针不指向设备内存
以下对代码的修改侧重于正确处理gAmat
和gYmat
对我来说似乎运行正常:
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t stat;
// create handle
stat = cublasCreate(&m_cuBLAS);
//params
const int C = 3;
const int M = 2;
long lda = C;
long ldb = M;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float **ggAmat, **ggYmat;
cudaMalloc(&ggAmat, sizeof(float*));
cudaMalloc(&ggYmat, sizeof(float*));
cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
cublasStatus_t status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
ggAmat,
lda, //or 1
ggYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
$ nvcc -o t130 t130.cu -lcublas
t130.cu(15): warning: variable "stat" was set but never used
t130.cu(24): warning: variable "ldb" was declared but never referenced
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
info = 0
devInfoArray = 0
-0.0226168, 0.514827, -4.29722
========= ERROR SUMMARY: 0 errors
$
$cat t130.cu
#包括
#包括
#包括
#包括
#包括
#包括
#包括
int main()
{
//初始化id、句柄和stat
int id=cudaGetDevice(&id);
库布拉山德勒穆库布拉斯;
库布拉斯塔图斯统计局;
//创建句柄
stat=cublasCreate(&m_cuBLAS);
//params
常数int C=3;
常数int M=2;
长lda=C;
长ldb=M;
//初始变量
浮动*Amat、*Ymat、*Xmat;
浮球*gAmat,*gYmat;
//分配内存
Amat=(浮动*)malloc(M*C*sizeof(浮动));
Ymat=(浮动*)malloc(C*sizeof(浮动));
Xmat=(float*)malloc(M*sizeof(float));
srand(100);
对于(int i=0;i std::cout--“Aarray是指向以列主格式存储的矩阵的指针数组”。我在你的代码中没有看到任何指向矩阵的指针数组,是吗?谢谢你的反应,它现在运行,这比以前好多了,但是当我运行与你相同的代码时,我得到了状态13的响应,加上我得到了错误的答案。Matlab告诉我结果应该是[-6.5,9.7]。使用a=[6,7,6,5,5]
和Y=[9,3,10]
那么你认为这与安装有关吗?当我使用你建议的A和Y值运行时,我得到的前两个输出值实际上是-6.5和9.7,并且报告的状态为零。因此,如果你遇到错误,我想你的安装可能有问题。如果你有新的CUDA安装,通常最好使用一个或多个示例代码验证操作。我已使用非随机值更新了此测试的答案。
$ cat t130.cu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <algorithm>
#include <cmath>
int main()
{
//init id, handle and stat
int id = cudaGetDevice(&id);
cublasHandle_t m_cuBLAS;
cublasStatus_t status;
// create handle
status = cublasCreate(&m_cuBLAS);
std::cout << "status = " << status << std::endl;
//params
const int C = 3;
const int M = 2;
long lda = C;
//init variables
float *Amat, *Ymat, *Xmat;
float *gAmat, *gYmat;
//allocate mem
Amat = (float*) malloc(M * C * sizeof(float));
Ymat = (float*) malloc(C * sizeof(float));
Xmat = (float*) malloc(M * sizeof(float));
srand(100);
#if 0
for (int i = 0; i < C * M; i++) {
Amat[i] = rand() % 10 + 1;
Amat[i] = (float)Amat[i];
}
for (int i = 0; i < C; i++) {
Ymat[i] = rand() % 10 + 1;
Ymat[i] = (float)Ymat[i];
}
#endif
Amat[0] = 6;
Amat[1] = 7;
Amat[2] = 6;
Amat[3] = 5;
Amat[4] = 5;
Amat[5] = 5;
Ymat[0] = 9;
Ymat[1] = 3;
Ymat[2] = 10;
//allocate mem
cudaMalloc( &gAmat, M * C * sizeof(float));
cudaMalloc( &gYmat, C * sizeof(float));
//copy mem
cudaMemcpy(gAmat, Amat, M * C * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(gYmat, Ymat, C * 1 * sizeof(float), cudaMemcpyHostToDevice);
float **ggAmat, **ggYmat;
cudaMalloc(&ggAmat, sizeof(float*));
cudaMalloc(&ggYmat, sizeof(float*));
cudaMemcpy(ggAmat, &gAmat, sizeof(float*), cudaMemcpyHostToDevice);
cudaMemcpy(ggYmat, &gYmat, sizeof(float*), cudaMemcpyHostToDevice);
//init info params
int info = 0;
int devInfoArray[1] = { 0 };
//Synchronize (not necesarry I think, but just to test)
cudaDeviceSynchronize();
//run cublas
status = cublasSgelsBatched(m_cuBLAS,
CUBLAS_OP_N,
C,
M,
1,
ggAmat,
lda, //or 1
ggYmat,
lda,
&info,
NULL,
1);
//Output info
std::cout << "status = " << status << std::endl;
std::cout << "info = " << info << std::endl;
std::cout << "devInfoArray = " << devInfoArray[0] << std::endl;
cudaMemcpy(Xmat, gYmat, C * 1 * sizeof(float), cudaMemcpyDeviceToHost);
//Output printed
std::cout << Xmat[0] << ", " << Xmat[1] << ", " << Xmat[2] << std::endl;
//free memory
free(Amat);
free(Ymat);
free(Xmat);
cudaFree(gAmat);
cudaFree(gYmat);
//destory handle
cublasDestroy(m_cuBLAS);
return 0;
}
$ nvcc -o t130 t130.cu -lcublas
$ cuda-memcheck ./t130
========= CUDA-MEMCHECK
status = 0
status = 0
info = 0
devInfoArray = 0
-6.5, 9.7, 0.707106
========= ERROR SUMMARY: 0 errors
$