Cuda LU分解在LAPACK和cuBLAS/cuSOLVER之间得到不同的结果
Cuda LU分解在LAPACK和cuBLAS/cuSOLVER之间得到不同的结果,cuda,linear-algebra,lapack,cublas,cusolver,Cuda,Linear Algebra,Lapack,Cublas,Cusolver,我正在测试一些场景,其中函数dgetrf在与cuBLAS/cuSOLVER一起使用时,其返回结果与使用LAPACK时不同。例如,我正在研究以下矩阵的LU分解: [2.0 4.0 1.0 -3.0 0.0; -1.0 -2.0 2.0 4.0 0.0; 4.0 2.0 -3.0 5.0 0.0; 5.0 -4.0 -3.0 1.0 0.0; 0.0 0.0 0.0 0.0 0.0] 我首先尝试从cuBLAS/cuSOLVER调用dgetrf,如下所示(警告:下面是很难看的测试代码!) 当我尝试对LAPACK执行同样的操作时(警告:更难看的代码!)
我正在测试一些场景,其中函数dgetrf在与cuBLAS/cuSOLVER一起使用时,其返回结果与使用LAPACK时不同。例如,我正在研究以下矩阵的LU分解:
 2.0  4.0  1.0 -3.0  0.0
-1.0 -2.0  2.0  4.0  0.0
 4.0  2.0 -3.0  5.0  0.0
 5.0 -4.0 -3.0  1.0  0.0
 0.0  0.0  0.0  0.0  0.0
我首先尝试从cuBLAS/cuSOLVER调用dgetrf,如下所示(警告:下面是很难看的测试代码!)
当我尝试对LAPACK执行同样的操作时(警告:更难看的代码!):
我知道它们是不同的库,但这是预期的行为吗?
当我编译您的CUDA代码时,我收到一条警告,提示cusolver句柄在被赋值之前就被使用了。您不应该忽略这类警告,因为您在查询工作区大小的函数(cusolverDnDgetrf_bufferSize)中对该句柄的用法是不正确的。不过,这并不是问题所在。
我认为您的两个测试用例之间没有任何区别,您似乎是对结果的解读有误。
查看dgetrf的文档可以看到,info值为5意味着U(5,5)为零,这对于后续使用是有问题的。这并不表示dgetrf分解本身成功或失败(如您的打印输出所暗示的那样),而是反映了输入数据的某种性质。事实上,正如文档中明确指出的那样,分解已经完成。
类似地,仅仅查看cusolver函数的返回值,我们无法获得关于该情况的任何信息。为了获得与LAPACK所报告内容类似的信息,需要查看cusolver写入的info值(即infoArray)。
通过这些更改,您的代码将报告相同的内容(信息值为5):
$cat t1556.cu
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main(int argc, char** argv){
const int matrixSize = 5;
int i, j;
double arrA[matrixSize][matrixSize] = {
{2.0, 4.0, 1.0, -3.0, 0.0},
{-1.0, -2.0, 2.0, 4.0, 0.0},
{4.0, 2.0, -3.0, 5.0, 0.0},
{5.0, -4.0, -3.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 0.0, 0.0}
};
double *arrADev, *workArray;
double **matrixArray;
int *pivotArray;
int *infoArray;
double flat[matrixSize*matrixSize] = {0};
cublasHandle_t cublasHandle;
cublasStatus_t cublasStatus;
cudaError_t error;
cudaError cudaStatus;
cusolverStatus_t cusolverStatus;
cusolverDnHandle_t cusolverHandle;
double *matrices[2];
error = cudaMalloc(&arrADev, sizeof(double) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&matrixArray, sizeof(double*) * 2);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&pivotArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&infoArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
cublasStatus = cublasCreate(&cublasHandle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
//maps matrix to flat vector
for(i=0; i<matrixSize; i++){
当我编译您的CUDA代码时,我收到一条警告,提示cusolver句柄在被赋值之前就被使用了。您不应该忽略这类警告,因为您在查询工作区大小的函数中对该句柄的用法是不正确的。但这并不是问题所在。
我认为您的两个测试用例之间没有任何区别,您似乎是对结果的解读有误。
查看dgetrf的文档可以看到,info值为5表示U(5,5)为零,这对于后续使用来说是有问题的。这并不表示dgetrf分解本身成功或失败(如您的打印输出所暗示的那样),而是反映了输入数据的某种性质。事实上,正如文档中明确指出的那样,分解已经完成。
类似地,仅仅查看cusolver函数的返回值,我们并不能得到关于该情况的任何信息。
通过这些更改,您的代码将报告相同的内容(信息值为5):
$cat t1556.cu
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
int main(int argc, char** argv){
const int matrixSize = 5;
int i, j;
double arrA[matrixSize][matrixSize] = {
{2.0, 4.0, 1.0, -3.0, 0.0},
{-1.0, -2.0, 2.0, 4.0, 0.0},
{4.0, 2.0, -3.0, 5.0, 0.0},
{5.0, -4.0, -3.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 0.0, 0.0}
};
double *arrADev, *workArray;
double **matrixArray;
int *pivotArray;
int *infoArray;
double flat[matrixSize*matrixSize] = {0};
cublasHandle_t cublasHandle;
cublasStatus_t cublasStatus;
cudaError_t error;
cudaError cudaStatus;
cusolverStatus_t cusolverStatus;
cusolverDnHandle_t cusolverHandle;
double *matrices[2];
error = cudaMalloc(&arrADev, sizeof(double) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&matrixArray, sizeof(double*) * 2);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&pivotArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
#include <cblas.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
// Test program from the question: uploads a 5x5 matrix to the device, runs
// cublasDgetrfBatched and then cusolverDnDgetrf on it, and prints whether each
// call's *return status* indicated success. The info values written by the
// factorizations are never copied back or inspected in this program.
int main(int argc, char** argv){
const int matrixSize = 5;
int i, j;
// Input matrix written row by row; the last row and last column are all zero.
double arrA[matrixSize][matrixSize] = {
{2.0, 4.0, 1.0, -3.0, 0.0},
{-1.0, -2.0, 2.0, 4.0, 0.0},
{4.0, 2.0, -3.0, 5.0, 0.0},
{5.0, -4.0, -3.0, 1.0, 0.0},
{0.0, 0.0, 0.0, 0.0, 0.0}
};
double *arrADev, *workArray;
double **matrixArray;
int *pivotArray;
int *infoArray;
double flat[matrixSize*matrixSize] = {0};
cublasHandle_t cublasHandle;
cublasStatus_t cublasStatus;
cudaError_t error;
cudaError cudaStatus;
cusolverStatus_t cusolverStatus;
cusolverDnHandle_t cusolverHandle;
double *matrices[2];
// Device allocations. Failures are reported on stderr but execution continues.
error = cudaMalloc(&arrADev, sizeof(double) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
// Room for two matrix pointers, although only one is copied over below.
error = cudaMalloc(&matrixArray, sizeof(double*) * 2);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
error = cudaMalloc(&pivotArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
// infoArray is allocated but never initialized or read back in this program.
error = cudaMalloc(&infoArray, sizeof(int) * matrixSize*matrixSize);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
cublasStatus = cublasCreate(&cublasHandle);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
// Flatten arrA so that element (i, j) lands at flat[i + j*matrixSize],
// i.e. consecutive entries of one column are adjacent in memory.
for(i=0; i<matrixSize; i++){
for(j=0; j<matrixSize; j++){
flat[i+j*matrixSize] = arrA[i][j];
}
}
// Copy the flattened matrix A to the device buffer arrADev.
cublasStatus = cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize);
if (cublasStatus != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error %i\n",cublasStatus);
// Record the device address of the matrix in a host-side pointer array.
matrices[0] = arrADev;
// Copy that single pointer to the device pointer array used by the batched call.
error = cudaMemcpy(matrixArray, matrices, sizeof(double*)*1, cudaMemcpyHostToDevice);
if (error != cudaSuccess) fprintf(stderr,"\nError: %s\n",cudaGetErrorString(error));
int Lwork;
// Query the workspace size for the cuSOLVER LU factorization.
// NOTE(review): cusolverHandle is passed here before cusolverDnCreate() is
// called (that call only happens further down), so the handle is still
// uninitialized at this point. The returned status is also never checked.
cusolverStatus = cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork);
// cudaStatus is assigned here but never examined afterwards.
cudaStatus = cudaMalloc((void**)&workArray, Lwork*sizeof(double));
// cuBLAS LU factorization (batch of one matrix). Only the function's return
// status is inspected below; the contents of infoArray are not.
cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, 1);
if (cublasStatus == CUBLAS_STATUS_SUCCESS)
printf("cuBLAS DGETRF SUCCESSFUL! \n");
else
printf("cuBLAS DGETRF UNSUCCESSFUL! \n");
// cuSOLVER LU factorization. Reuses arrADev, pivotArray and infoArray from
// the cuBLAS call above; the status returned by cusolverDnCreate is
// overwritten by the next assignment without being checked.
// NOTE(review): arrADev presumably already holds the cuBLAS factorization
// at this point rather than the original matrix - confirm against the
// cuBLAS documentation.
cusolverStatus = cusolverDnCreate(&cusolverHandle);
cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
printf("cuSOLVER DGETRF SUCCESSFUL! \n");
else
printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");
// No device buffers are freed and neither handle is destroyed before exit.
return 0;
}
cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!
#include <iostream>
#include <vector>
using namespace std;
extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );
// Factors a 5x5 test matrix with LAPACK's dgetrf_ and prints "successful" only
// when the returned info value is zero.
int main()
{
    // Matrix entries in the order they are stored in the buffer handed to
    // LAPACK (five values per group).
    const double values[5][5] = {
        { 2.0,  4.0,  1.0, -3.0, 0.0},
        {-1.0, -2.0,  2.0,  4.0, 0.0},
        { 4.0,  2.0, -3.0,  5.0, 0.0},
        { 5.0, -4.0, -3.0,  1.0, 0.0},
        { 0.0,  0.0,  0.0,  0.0, 0.0}
    };

    vector<double> a;
    for (int group = 0; group < 5; ++group) {
        for (int k = 0; k < 5; ++k) {
            a.push_back(values[group][k]);
        }
    }

    int order = 5;          // matrix is order x order
    int leadingDim = order; // leading dimension passed to LAPACK
    int pivots[5];
    int status;

    dgetrf_(&order, &order, &a[0], &leadingDim, pivots, &status);

    // Any nonzero info value is reported as "unsuccessful".
    if (status != 0)
        printf("dgetrf unsuccessful\n");
    else
        printf("dgetrf successful\n");

    return 0;
}
dgetrf unsuccessful
$ cat t1556.cu
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cusolverDn.h>
// Reports a failed CUDA runtime call on stderr; returns true when status is cudaSuccess.
static bool cudaOk(cudaError_t status) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "\nError: %s\n", cudaGetErrorString(status));
    return false;
}

// Reports a failed cuBLAS call on stderr; returns true on CUBLAS_STATUS_SUCCESS.
static bool cublasOk(cublasStatus_t status) {
    if (status == CUBLAS_STATUS_SUCCESS) return true;
    fprintf(stderr, "error %i\n", (int)status);
    return false;
}

// Reports a failed cuSOLVER call on stderr; returns true on CUSOLVER_STATUS_SUCCESS.
static bool cusolverOk(cusolverStatus_t status) {
    if (status == CUSOLVER_STATUS_SUCCESS) return true;
    fprintf(stderr, "cuSOLVER error %i\n", (int)status);
    return false;
}

// Factors a 5x5 test matrix with cublasDgetrfBatched and then with
// cusolverDnDgetrf, prints whether each call returned a success status, and
// finally prints the contents of the device info array (the value that
// corresponds to LAPACK's `info` output).
//
// Returns 0 when every setup/teardown call succeeded, 1 otherwise. On an early
// failure the function returns immediately and leaves resource reclamation to
// process exit.
int main(int argc, char** argv){
    (void)argc;
    (void)argv;

    const int matrixSize = 5;
    const int numElements = matrixSize * matrixSize;
    const int batchSize = 1;
    int i, j;

    // Input matrix written row by row; the last row and column are all zero.
    double arrA[matrixSize][matrixSize] = {
        {2.0, 4.0, 1.0, -3.0, 0.0},
        {-1.0, -2.0, 2.0, 4.0, 0.0},
        {4.0, 2.0, -3.0, 5.0, 0.0},
        {5.0, -4.0, -3.0, 1.0, 0.0},
        {0.0, 0.0, 0.0, 0.0, 0.0}
    };
    double flat[numElements] = {0};
    int hinfoArray[numElements] = {0};   // host copy of the device info array

    double *arrADev = NULL, *workArray = NULL;
    double **matrixArray = NULL;
    int *pivotArray = NULL;
    int *infoArray = NULL;
    double *matrices[batchSize];
    cublasHandle_t cublasHandle;
    cusolverDnHandle_t cusolverHandle;
    cublasStatus_t cublasStatus;
    cusolverStatus_t cusolverStatus;
    int Lwork = 0;

    // Device allocations.
    if (!cudaOk(cudaMalloc(&arrADev, sizeof(double) * numElements))) return 1;
    if (!cudaOk(cudaMalloc(&matrixArray, sizeof(double*) * batchSize))) return 1;
    if (!cudaOk(cudaMalloc(&pivotArray, sizeof(int) * matrixSize * batchSize))) return 1;
    if (!cudaOk(cudaMalloc(&infoArray, sizeof(int) * numElements))) return 1;
    // cudaMalloc does not initialize memory. All numElements entries are
    // printed at the end, so clear them to make that output well defined.
    if (!cudaOk(cudaMemset(infoArray, 0, sizeof(int) * numElements))) return 1;

    // Both library handles are created before any call that uses them.
    if (!cublasOk(cublasCreate(&cublasHandle))) return 1;
    if (!cusolverOk(cusolverDnCreate(&cusolverHandle))) return 1;

    // Flatten arrA so that element (i, j) lands at flat[i + j*matrixSize]
    // (column-major storage, as expected by cuBLAS/cuSOLVER).
    for (i = 0; i < matrixSize; i++) {
        for (j = 0; j < matrixSize; j++) {
            flat[i + j*matrixSize] = arrA[i][j];
        }
    }

    // Copy matrix A to the device.
    if (!cublasOk(cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize))) return 1;

    // The batched routine takes a device array of matrix pointers.
    matrices[0] = arrADev;
    if (!cudaOk(cudaMemcpy(matrixArray, matrices, sizeof(double*) * batchSize, cudaMemcpyHostToDevice))) return 1;

    // Workspace for the cuSOLVER LU factorization.
    if (!cusolverOk(cusolverDnDgetrf_bufferSize(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, &Lwork))) return 1;
    if (!cudaOk(cudaMalloc((void**)&workArray, Lwork * sizeof(double)))) return 1;

    // cuBLAS LU factorization. The return status only says whether the call
    // itself was valid, not whether the matrix turned out to be singular.
    cublasStatus = cublasDgetrfBatched(cublasHandle, matrixSize, matrixArray, matrixSize, pivotArray, infoArray, batchSize);
    if (cublasStatus == CUBLAS_STATUS_SUCCESS)
        printf("cuBLAS DGETRF SUCCESSFUL! \n");
    else
        printf("cuBLAS DGETRF UNSUCCESSFUL! \n");

    // The batched factorization overwrote arrADev in place with its L and U
    // factors. Upload the original matrix again so that cuSOLVER factors A
    // itself rather than the output of the previous call.
    if (!cublasOk(cublasSetMatrix(matrixSize, matrixSize, sizeof(double), flat, matrixSize, arrADev, matrixSize))) return 1;

    // cuSOLVER LU factorization; it writes its info value to infoArray[0].
    cusolverStatus = cusolverDnDgetrf(cusolverHandle, matrixSize, matrixSize, arrADev, matrixSize, workArray, pivotArray, infoArray);
    if (cusolverStatus == CUSOLVER_STATUS_SUCCESS)
        printf("cuSOLVER DGETRF SUCCESSFUL! \n");
    else
        printf("cuSOLVER DGETRF UNSUCCESSFUL! \n");

    // Read the info values back. This blocking copy also waits for the
    // factorization to finish. Only element 0 is written by the calls above;
    // the remaining entries are the zeros set by cudaMemset.
    if (!cudaOk(cudaMemcpy(hinfoArray, infoArray, sizeof(int) * numElements, cudaMemcpyDeviceToHost))) return 1;
    for (i = 0; i < numElements; i++) printf("%d,", hinfoArray[i]);
    printf("\n");

    // Release device memory and library handles.
    bool cleanupOk = true;
    cleanupOk = cudaOk(cudaFree(workArray)) && cleanupOk;
    cleanupOk = cudaOk(cudaFree(infoArray)) && cleanupOk;
    cleanupOk = cudaOk(cudaFree(pivotArray)) && cleanupOk;
    cleanupOk = cudaOk(cudaFree(matrixArray)) && cleanupOk;
    cleanupOk = cudaOk(cudaFree(arrADev)) && cleanupOk;
    cleanupOk = cusolverOk(cusolverDnDestroy(cusolverHandle)) && cleanupOk;
    cleanupOk = cublasOk(cublasDestroy(cublasHandle)) && cleanupOk;

    return cleanupOk ? 0 : 1;
}
$ nvcc -o t1556 t1556.cu -lcublas -lcusolver
t1556.cu(30): warning: variable "cudaStatus" was set but never used
$ ./t1556
cuBLAS DGETRF SUCCESSFUL!
cuSOLVER DGETRF SUCCESSFUL!
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
$ cat t1557.cpp
#include <iostream>
#include <vector>
#include <lapacke/lapacke.h>
using namespace std;
// extern "C" void dgetrf_(int* dim1, int* dim2, double* a, int* lda, int* ipiv, int* info);
// extern "C" void dgetrs_(char *TRANS, int *N, int *NRHS, double *A, int *LDA, int *IPIV, double *B, int *LDB, int *INFO );
// Factors a 5x5 test matrix with LAPACK_dgetrf, prints the returned info value
// and then a "successful"/"unsuccessful" line depending on whether it is zero.
int main()
{
    // Buffer contents in storage order, five values per line.
    // NOTE(review): LAPACK presumably reads this buffer column by column, so
    // each line here would be one column of the factored matrix - confirm.
    static const double entries[] = {
         2.0,  4.0,  1.0, -3.0, 0.0,
        -1.0, -2.0,  2.0,  4.0, 0.0,
         4.0,  2.0, -3.0,  5.0, 0.0,
         5.0, -4.0, -3.0,  1.0, 0.0,
         0.0,  0.0,  0.0,  0.0, 0.0
    };
    const int entryCount = sizeof(entries) / sizeof(entries[0]);
    std::vector<double> a(entries, entries + entryCount);

    int n = 5;
    int lda = n;
    int pivots[5];
    int status;

    LAPACK_dgetrf(&n, &n, &a[0], &lda, pivots, &status);

    printf("info = %d\n", status);
    if (status != 0)
        printf("dgetrf unsuccessful\n");
    else
        printf("dgetrf successful\n");

    return 0;
}
$ g++ t1557.cpp -o t1557 -llapack
$ ./t1557
info = 5
dgetrf unsuccessful
$