CUDA cuBLAS:含零主元(zero pivot)矩阵的求逆结果不正确
标签:cuda, matrix-inverse, cublas。自 CUDA 5.5 起,CUBLAS 库提供了用于批量矩阵分解和求逆的例程(分别为 cublas<t>getrfBatched 和 cublas<t>getriBatched)。参照文档,我使用这些例程编写了一个 n×n 矩阵求逆的测试代码。仅当矩阵的所有主元均非零时,该代码才会给出正确的输出;将任何一个主元设为零都会导致错误的结果。我已经用 MATLAB 验证了结果。我知道我提供的输入是行主序矩阵,而 CUBLAS 期望列主序矩阵,但这并不重要,因为这只会使结果被转置。为保险起见,我也用列主序输入做了测试,但得到了相同的行为。我很困惑,因为:
cublas<t>getriBatched 需要主元交换信息数组 P 作为输入,而该数组正是 cublas<t>getrfBatched 的输出。因此,如果行交换已经消除了零主元,那么求逆例程应该能够自动处理这种情况。
如何使用 CUBLAS 对包含零主元的矩阵进行求逆?
以下是一个具有不同测试用例的自包含可编译示例:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Wraps a CUDA runtime call: the expression is evaluated exactly once and,
// if it does not return cudaSuccess, the file, line and CUDA error string are
// printed to stderr, the device is reset and the process exits with
// EXIT_FAILURE.
#define cudacall(call)                                                        \
    do {                                                                      \
        const cudaError_t cudacall_result_ = (call);                          \
        if (cudacall_result_ != cudaSuccess) {                                \
            fprintf(stderr,                                                   \
                    "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n",       \
                    __FILE__, __LINE__,                                       \
                    cudaGetErrorString(cudacall_result_));                    \
            cudaDeviceReset();                                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
// Wraps a cuBLAS call: the expression is evaluated exactly once and, if it
// does not return CUBLAS_STATUS_SUCCESS, the file, line and numeric status
// code are printed to stderr, the device is reset and the process exits with
// EXIT_FAILURE.
#define cublascall(call)                                                      \
    do {                                                                      \
        const cublasStatus_t cublascall_result_ = (call);                     \
        if (cublascall_result_ != CUBLAS_STATUS_SUCCESS) {                    \
            fprintf(stderr,                                                   \
                    "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n",       \
                    __FILE__, __LINE__, cublascall_result_);                  \
            cudaDeviceReset();                                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
/*
 * Inverts a single n x n single-precision matrix that already resides in
 * device memory, using the batched cuBLAS LU routines with a batch of one.
 *
 *   src_d  device pointer to the n x n input matrix (leading dimension n).
 *          It is handed to cublasSgetrfBatched as the factorization
 *          workspace, so its contents are overwritten by the LU factors.
 *   dst_d  device pointer to an n x n buffer that receives the inverse
 *          (leading dimension n). Must not alias src_d.
 *   n      matrix dimension.
 *
 * cuBLAS interprets both buffers as column-major.
 *
 * On any CUDA/cuBLAS error, or when either routine reports a non-zero info
 * value, a message is printed to stderr, the device is reset and the process
 * exits with EXIT_FAILURE.
 */
void invert_device(float* src_d, float* dst_d, int n)
{
    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    const int batchSize = 1;
    const int lda = n;

    // P holds the pivot indices produced by getrf and consumed by getri;
    // INFO holds one status word per matrix in the batch.
    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P, n * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    // The batched API takes a device-resident array of matrix pointers.
    float* A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    cublascall(cublasSgetrfBatched(handle, n, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    // Any non-zero info means the factorization is unusable: info = i > 0
    // reports U(i,i) == 0 for *any* i, not only i == n, so testing against n
    // alone lets singular factorizations slip through to the inversion step.
    if (INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d, sizeof(C)));
    cudacall(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice));

    cublascall(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, lda, INFO, batchSize));

    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));
    if (INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release every temporary, including the two pointer arrays that the
    // original version leaked, and check the cleanup calls as well.
    cudacall(cudaFree(A_d));
    cudacall(cudaFree(C_d));
    cudacall(cudaFree(P));
    cudacall(cudaFree(INFO));
    cublascall(cublasDestroy_v2(handle));
}
/*
 * Host-side convenience wrapper around invert_device().
 *
 * Copies the n x n matrix at src to a freshly allocated device buffer, runs
 * invert_device() into a second device buffer, and copies that buffer back
 * to dst. Both device buffers are released before returning.
 */
void invert(float* src, float* dst, int n)
{
    // Same byte count is used for the input, the output and both copies.
    const size_t matrix_bytes = n * n * sizeof(float);

    float* device_in;
    float* device_out;

    cudacall(cudaMalloc<float>(&device_in, matrix_bytes));
    cudacall(cudaMemcpy(device_in, src, matrix_bytes, cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&device_out, matrix_bytes));

    invert_device(device_in, device_out, n);

    cudacall(cudaMemcpy(dst, device_out, matrix_bytes, cudaMemcpyDeviceToHost));

    cudaFree(device_in);
    cudaFree(device_out);
}
/*
 * Small driver: prints one of the 3 x 3 sample matrices below, inverts it in
 * place through invert(), and prints the result. The matrix under test is
 * chosen by editing the "selected" pointer.
 */
void test_invert()
{
    const int n = 3;

    // Random matrix with full pivots
    float full_pivots[n*n] = { 0.5f,  3,  4,
                               1,     3, 10,
                               4,     9, 16 };

    // Almost same as above matrix with first pivot zero
    float zero_pivot[n*n] = { 0, 3,  4,
                              1, 3, 10,
                              4, 9, 16 };

    float zero_pivot_col_major[n*n] = { 0,  1,  4,
                                        3,  3,  9,
                                        4, 10, 16 };

    float another_zero_pivot[n*n] = { 0, 3, 4,
                                      1, 5, 6,
                                      9, 8, 2 };

    float another_full_pivot[n*n] = { 22, 3, 4,
                                       1, 5, 6,
                                       9, 8, 2 };

    float singular[n*n] = { 1, 2, 3,
                            4, 5, 6,
                            7, 8, 9 };

    // Select the matrix under test here.
    float* selected = zero_pivot;

    printf("Input:\n\n");
    // Walk the matrix as a flat array; break the line after every n-th entry.
    for (int k = 0; k < n * n; ++k)
    {
        printf("%f\t", selected[k]);
        if (k % n == n - 1)
            printf("\n");
    }
    printf("\n\n");

    // Source and destination are the same host buffer.
    invert(selected, selected, n);

    printf("Inverse:\n\n");
    for (int k = 0; k < n * n; ++k)
    {
        printf("%f\t", selected[k]);
        if (k % n == n - 1)
            printf("\n");
    }
}
// Entry point: runs the inversion demo, then waits for an integer on stdin
// before returning (the value read is not used).
int main()
{
    test_invert();

    int ignored_input;
    scanf("%d", &ignored_input);

    return 0;
}
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#define cudacall(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
#define cublascall(call) \
do \
{ \
cublasStatus_t status = (call); \
if(CUBLAS_STATUS_SUCCESS != status) \
{ \
fprintf(stderr,"CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n", __FILE__, __LINE__, status); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
\
} \
while(0)
void invert_device(float* src_d, float* dst_d, int n)
{
cublasHandle_t handle;
cublascall(cublasCreate_v2(&handle));
int batchSize = 1;
int *P, *INFO;
cudacall(cudaMalloc<int>(&P,n * batchSize * sizeof(int)));
cudacall(cudaMalloc<int>(&INFO,batchSize * sizeof(int)));
int lda = n;
float *A[] = { src_d };
float** A_d;
cudacall(cudaMalloc<float*>(&A_d,sizeof(A)));
cudacall(cudaMemcpy(A_d,A,sizeof(A),cudaMemcpyHostToDevice));
cublascall(cublasSgetrfBatched(handle,n,A_d,lda,P,INFO,batchSize));
int INFOh = 0;
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh == n)
{
fprintf(stderr, "Factorization Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
float* C[] = { dst_d };
float** C_d;
cudacall(cudaMalloc<float*>(&C_d,sizeof(C)));
cudacall(cudaMemcpy(C_d,C,sizeof(C),cudaMemcpyHostToDevice));
cublascall(cublasSgetriBatched(handle,n,A_d,lda,P,C_d,lda,INFO,batchSize));
cudacall(cudaMemcpy(&INFOh,INFO,sizeof(int),cudaMemcpyDeviceToHost));
if(INFOh != 0)
{
fprintf(stderr, "Inversion Failed: Matrix is singular\n");
cudaDeviceReset();
exit(EXIT_FAILURE);
}
cudaFree(P), cudaFree(INFO), cublasDestroy_v2(handle);
}
void invert(float* src, float* dst, int n)
{
float* src_d, *dst_d;
cudacall(cudaMalloc<float>(&src_d,n * n * sizeof(float)));
cudacall(cudaMemcpy(src_d,src,n * n * sizeof(float),cudaMemcpyHostToDevice));
cudacall(cudaMalloc<float>(&dst_d,n * n * sizeof(float)));
invert_device(src_d,dst_d,n);
cudacall(cudaMemcpy(dst,dst_d,n * n * sizeof(float),cudaMemcpyDeviceToHost));
cudaFree(src_d), cudaFree(dst_d);
}
void test_invert()
{
const int n = 3;
//Random matrix with full pivots
float full_pivots[n*n] = { 0.5, 3, 4,
1, 3, 10,
4 , 9, 16 };
//Almost
LU = getrf( [A 0 ; 0 I]);
invA = getri( LU(1:3,1:3) )
$ cat t340.cu
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Checks the result of a CUDA runtime call. The wrapped expression runs once;
// on anything other than cudaSuccess the macro reports file, line and the
// CUDA error string on stderr, resets the device and terminates the process
// with EXIT_FAILURE.
#define cudacall(call)                                                        \
    do {                                                                      \
        const cudaError_t cudacall_rc_ = (call);                              \
        if (cudacall_rc_ != cudaSuccess) {                                    \
            fprintf(stderr,                                                   \
                    "CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n",       \
                    __FILE__, __LINE__, cudaGetErrorString(cudacall_rc_));    \
            cudaDeviceReset();                                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
// Checks the result of a cuBLAS call. The wrapped expression runs once; on
// anything other than CUBLAS_STATUS_SUCCESS the macro reports file, line and
// the numeric status code on stderr, resets the device and terminates the
// process with EXIT_FAILURE.
#define cublascall(call)                                                      \
    do {                                                                      \
        const cublasStatus_t cublascall_rc_ = (call);                         \
        if (cublascall_rc_ != CUBLAS_STATUS_SUCCESS) {                        \
            fprintf(stderr,                                                   \
                    "CUBLAS Error:\nFile = %s\nLine = %d\nCode = %d\n",       \
                    __FILE__, __LINE__, cublascall_rc_);                      \
            cudaDeviceReset();                                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
/*
 * Workaround variant of the batched inversion: the LU factorization is run on
 * a padded 17 x 17 matrix, and only the leading n x n block of the factors is
 * then inverted.
 *
 *   src_d  device pointer to a 17 x 17 matrix (leading dimension 17) whose
 *          leading n x n block is the matrix to invert. The caller in this
 *          file pads the remainder with an identity block. The buffer is used
 *          as the factorization workspace and is overwritten.
 *   dst_d  device pointer to an n x n buffer (leading dimension n) that
 *          receives the inverse of the leading block.
 *   n      dimension of the block to invert; presumably n <= 17 is required,
 *          since the pivot array and the factorization are sized for 17.
 *
 * On any CUDA/cuBLAS error, or when either routine reports a non-zero info
 * value, a message is printed to stderr, the device is reset and the process
 * exits with EXIT_FAILURE.
 */
void invert_device(float* src_d, float* dst_d, int n)
{
    // Dimension of the padded matrix that is actually factorized. It must
    // match the layout the caller uploaded into src_d.
    const int paddedDim = 17;

    cublasHandle_t handle;
    cublascall(cublasCreate_v2(&handle));

    const int batchSize = 1;
    const int lda = paddedDim;

    // P holds the pivot indices produced by getrf and consumed by getri;
    // INFO holds one status word per matrix in the batch.
    int *P, *INFO;
    cudacall(cudaMalloc<int>(&P, paddedDim * batchSize * sizeof(int)));
    cudacall(cudaMalloc<int>(&INFO, batchSize * sizeof(int)));

    // The batched API takes a device-resident array of matrix pointers.
    float* A[] = { src_d };
    float** A_d;
    cudacall(cudaMalloc<float*>(&A_d, sizeof(A)));
    cudacall(cudaMemcpy(A_d, A, sizeof(A), cudaMemcpyHostToDevice));

    // Factorize the full padded matrix.
    cublascall(cublasSgetrfBatched(handle, paddedDim, A_d, lda, P, INFO, batchSize));

    int INFOh = 0;
    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));

    // Any non-zero info means the factorization is unusable: info = i > 0
    // reports U(i,i) == 0 for any i, so comparing against the last index
    // alone would miss most singular cases.
    if (INFOh != 0)
    {
        fprintf(stderr, "Factorization Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    float* C[] = { dst_d };
    float** C_d;
    cudacall(cudaMalloc<float*>(&C_d, sizeof(C)));
    cudacall(cudaMemcpy(C_d, C, sizeof(C), cudaMemcpyHostToDevice));

    // Invert only the leading n x n block of the factors: the input keeps the
    // padded leading dimension, the output is a dense n x n matrix.
    cublascall(cublasSgetriBatched(handle, n, A_d, lda, P, C_d, n, INFO, batchSize));

    cudacall(cudaMemcpy(&INFOh, INFO, sizeof(int), cudaMemcpyDeviceToHost));
    if (INFOh != 0)
    {
        fprintf(stderr, "Inversion Failed: Matrix is singular\n");
        cudaDeviceReset();
        exit(EXIT_FAILURE);
    }

    // Release every temporary, including the two pointer arrays that the
    // original version leaked, and check the cleanup calls as well.
    cudacall(cudaFree(A_d));
    cudacall(cudaFree(C_d));
    cudacall(cudaFree(P));
    cudacall(cudaFree(INFO));
    cublascall(cublasDestroy_v2(handle));
}
/*
 * Host-side wrapper for the padded-matrix workaround.
 *
 * Uploads 17 * 17 floats from src to the device, lets invert_device() write
 * an n x n result into a second device buffer, and copies those n * n floats
 * back to dst. Both device buffers are released before returning.
 */
void invert(float* src, float* dst, int n)
{
    // The input is always the padded 17 x 17 matrix; the output is n x n.
    const size_t padded_bytes = 17 * 17 * sizeof(float);
    const size_t result_bytes = n * n * sizeof(float);

    float* device_in;
    float* device_out;

    cudacall(cudaMalloc<float>(&device_in, padded_bytes));
    cudacall(cudaMemcpy(device_in, src, padded_bytes, cudaMemcpyHostToDevice));
    cudacall(cudaMalloc<float>(&device_out, result_bytes));

    invert_device(device_in, device_out, n);

    cudacall(cudaMemcpy(dst, device_out, result_bytes, cudaMemcpyDeviceToHost));

    cudaFree(device_in);
    cudaFree(device_out);
}
/*
 * Driver for the padded-matrix workaround: embeds a 3 x 3 matrix with a zero
 * first pivot in the top-left corner of a 17 x 17 matrix whose remaining
 * diagonal is 1, prints the 3 x 3 block, inverts it through invert(), and
 * prints the 3 x 3 result.
 */
void test_invert()
{
    const int n = 3;
    const int padded = 17;

    /* Plain 3 x 3 test matrices, disabled in this variant:
       full_pivots          = { 0.5, 3, 4,   1, 3, 10,   4, 9, 16 }  (random, full pivots)
       zero_pivot           = { 0, 3, 4,     1, 3, 10,   4, 9, 16 }  (first pivot zero)
       zero_pivot_col_major = { 0, 1, 4,     3, 3, 9,    4, 10, 16 }
       another_zero_pivot   = { 0, 3, 4,     1, 5, 6,    9, 8, 2 }
       another_full_pivot   = { 22, 3, 4,    1, 5, 6,    9, 8, 2 }
       singular             = { 1, 2, 3,     4, 5, 6,    7, 8, 9 }
    */

    // The matrix to invert, stored row by row.
    const float block[n*n] = { 0, 3,  4,
                               1, 3, 10,
                               4, 9, 16 };

    // Build the padded matrix: start from all zeros, put ones on the
    // diagonal below the block, then copy the block into the corner.
    float zero_pivot_war[padded * padded] = { 0 };
    for (int d = n; d < padded; ++d)
        zero_pivot_war[d * padded + d] = 1;
    for (int r = 0; r < n; ++r)
        for (int c = 0; c < n; ++c)
            zero_pivot_war[r * padded + c] = block[r * n + c];

    float result[n*n];

    // Select matrix by setting "a"
    float* a = zero_pivot_war;

    printf("Input:\n\n");
    for (int row = 0; row < n; ++row)
    {
        // Rows of the padded matrix are 17 elements apart.
        const float* row_ptr = a + row * padded;
        for (int col = 0; col < n; ++col)
            printf("%f\t", row_ptr[col]);
        printf("\n");
    }
    printf("\n\n");

    invert(a, result, n);

    printf("Inverse:\n\n");
    for (int row = 0; row < n; ++row)
    {
        const float* row_ptr = result + row * n;
        for (int col = 0; col < n; ++col)
            printf("%f\t", row_ptr[col]);
        printf("\n");
    }
}
// Entry point: runs the padded-matrix inversion demo and returns 0.
int main()
{
    test_invert();
    // Disabled wait-for-input before exit:
    // int n; scanf("%d",&n);
    return 0;
}
$ nvcc -arch=sm_20 -o t340 t340.cu -lcublas
$ cuda-memcheck ./t340
========= CUDA-MEMCHECK
Input:
0.000000 3.000000 4.000000
1.000000 3.000000 10.000000
4.000000 9.000000 16.000000
Inverse:
-0.700000 -0.200000 0.300000
0.400000 -0.266667 0.066667
-0.050000 0.200000 -0.050000
========= ERROR SUMMARY: 0 errors
$