Algorithm cuda矩阵逆高斯约当

Algorithm cuda矩阵逆高斯约当,algorithm,matrix,cuda,gaussian,inverse,Algorithm,Matrix,Cuda,Gaussian,Inverse,我没有发现任何类似的问题。 我在写高斯-乔丹逆矩阵算法。该算法的思想很简单:) 我只想求下三角矩阵的逆。我得到了几乎正确的答案。我哪里做错了什么?有人能帮我吗?我会很感激的 下三角矩阵(nxn) dI单位矩阵(nxn) n一个方向上矩阵的大小(n%16=0) dim3螺纹锁固胶(n/16,n/16) dim3(16,16) 我知道这是一个简单的实现,但首先我需要它正常工作:) 有人能帮我或给我任何提示吗? 我会很感激的。非常感谢 这是整个cpu代码: #include <stdio

我没有发现任何类似的问题。 我在写高斯-乔丹逆矩阵算法。该算法的思想很简单:)

我只想求下三角矩阵的逆。我得到了几乎正确的答案。我哪里做错了什么?有人能帮我吗?我会很感激的

  • 下三角矩阵(nxn)
  • dI单位矩阵(nxn)
  • n一个方向上矩阵的大小(n%16=0)

  • dim3螺纹锁固胶(n/16,n/16)

  • dim3(16,16)
我知道这是一个简单的实现,但首先我需要它正常工作:) 有人能帮我或给我任何提示吗? 我会很感激的。非常感谢

这是整个cpu代码:

  #include <stdio.h>
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#pragma comment(lib, "cuda.lib")
#pragma comment(lib, "cudart.lib")
#include <cuda.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include "device_launch_parameters.h"
#include <cublas_v2.h>

using namespace std;

 __global__ void gaussjordan(float *A,  float *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i)
            if(y>=i){
                P=A[x*n+i]/A[i*n+i];
                I[x*n+y]-= I[i*n+y]*P;
                A[x*n+y]-= A[i*n+y]*P;
            }
            __syncthreads();
 }


 __global__ void dev(float *d_A,  float *dI, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    if(x<h && y<h)
        if(d_A[x*h+x]!=0){
            dI[x*h+y]  /= d_A[x*h+x];
            d_A[x*h+y] /= d_A[x*h+x];
        }
    __syncthreads();

}

void savetofile(float *A, string s, int n, int h)
{
    std::ofstream plik;
    plik.open(s);

    for(int j=0;j<h;j++){
        for(int i=0;i<h;i++){
            plik<<A[j*n+i]<<"\t";}
            plik<<endl;}
    plik.close();
}

int main()
{
    int n = 16;
// creating input
    float *iL = new float [n*n];
    float *L = new float [n*n];
    for(int i=0;i<n;i++)
        for(int j=0;j<n;j++)
            if(i==j || i>j) L[i*n+j] = (i*n+j+1)*(i*n+j+1)*0.007 + (i*n+j+1)*0.01 -(i*n+j+1)*(i*n+j+1)*(i*n+j+1)*0.0005;
            else L[i*n+j] = 0;

    savetofile(L,"L.txt",n,n);

    cout<<"inv\n";
    float *d_A, *d_L,*I, *dI;
    float time;
    cudaError_t err;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    int ddsize = n*n*sizeof(float);

    dim3 threadsPerBlock(n/16,n/16);
    dim3 numBlocks(16,16);
// memory allocation    
    err= cudaMalloc( (void**)  &d_A, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err= cudaMalloc( (void**)   &dI, ddsize);   if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    I = new float[n*n];

    for(int i=0;i<n;i++){
        for(int j=0;j<n;j++){
            if(i==j) I[i*n+i]=1.0;
                else I[i*n+j]=0.0;}}
 //copy data from GPU to CPU
    err =cudaMemcpy(  d_A,    L, ddsize, cudaMemcpyHostToDevice); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
    err =cudaMemcpy(   dI,    I, ddsize, cudaMemcpyHostToDevice);  if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;}
//timer start
    cudaEventRecord( start, 0);
// L^(-1)    
    for(int i=0;i<n;i++){
        gaussjordan<<<numBlocks,threadsPerBlock>>>(d_A, dI, n, i);
    }
    dev<<<numBlocks,    threadsPerBlock>>>(d_A, dI, n); 

    err = cudaMemcpy(iL, dI, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 
    err = cudaMemcpy(L, d_A, ddsize, cudaMemcpyDeviceToHost ); if(err!=cudaSuccess){cout<<cudaGetErrorString(err)<<" in "<<__FILE__<<" at line "<< __LINE__<<endl;} 

    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &time, start, stop );
    cudaEventDestroy( start );
    cudaEventDestroy( stop );

    std::cout<<"Cuda Time - inverse: "<< time <<"ms\n";
    savetofile(iL,"inv.txt",n,n);
    savetofile(L,"I.txt",n,n);
    cudaFree(d_A);
    cudaFree(dI);
    delete []I;
    delete []L;
    delete []iL;
    system("Pause");
 return 0;
}
它应该是:

60,6060606060606    4,44089209850063e-16    4,85722573273506e-17    -3,12250225675825e-17   0   1,73472347597681e-18    -1,08420217248550e-18   -7,58941520739853e-19   4,33680868994202e-19    -5,42101086242752e-19   0   -6,93889390390723e-18   0   -1,38777878078145e-17   0   1,18720137887163e-17
-34,1232841232841   -2,13675213675214   0   8,67361737988404e-18    3,03576608295941e-18    8,67361737988404e-19    -1,73472347597681e-18   1,35525271560688e-18    -8,67361737988404e-19   1,00288700954909e-18    0   0   6,93889390390723e-18    6,93889390390723e-18    -1,38777878078145e-17   3,02221355580334e-18
-17,9130271437964   1,91603268526345    -0,0799200799200800 1,30104260698261e-18    1,95156391047391e-18    -9,75781955236954e-19   1,95156391047391e-18    2,16840434497101e-19    -3,52365706057789e-19   -1,62630325872826e-19   1,38777878078145e-17    -3,46944695195361e-18   0   0   0   -2,72405795836983e-18
-2,86140643299924   0,0760191125748172  0,0748166415934231  -0,0196633632216454 -2,41234983378025e-18   7,99599102208060e-19    3,25260651745651e-19    -4,74338450462408e-19   2,67662411332359e-19    2,91379333855479e-19    -2,16840434497101e-18   -4,33680868994202e-19   1,30104260698261e-18    0   0   6,86096687275983e-20
-1,33482739506506   0,0346053236774996  0,00125734163772674 0,0187469132242915  -0,00767825058738617    5,35324822664718e-19    -2,23616698075135e-19   5,08219768352580e-20    5,92923063078010e-20    1,74488787134386e-19    -4,33680868994202e-19   4,33680868994202e-19    -2,16840434497101e-19   2,16840434497101e-19    0   -1,19008129089229e-19
-0,793561224702690  0,0203250367373064  0,000727127971238783    0,000177630032830862    0,00739591005669882 -0,00376795430225022    4,98055372985529e-19    -3,84552958053452e-19   3,20178454062126e-19    -1,35525271560688e-19   6,50521303491303e-19    -1,08420217248550e-19   1,08420217248550e-19    -2,16840434497101e-19   0   -7,15742840429884e-20
-0,532255026297144  0,0135340901236068  0,000479383336751935    0,000115847127348313    4,51920594555328e-05    0,00365346070706817 -0,00212282675610843    1,37219337455197e-19    -5,14996031930615e-19   3,30342849429177e-19    0   -2,71050543121376e-19   1,08420217248550e-19    0   0   5,08219768352580e-20
-0,384130052448431  0,00972113086608457 0,000342250536212794    8,21235560483452e-05    3,18129608485860e-05    1,56232096436654e-05    0,00206784220009096 -0,00131233595800525    6,39509875176997e-20    -3,37542629480839e-19   -1,08420217248550e-19   2,16840434497101e-19    0   0   0   -8,47032947254300e-22
-0,291692030052418  0,00735419164507677 0,000257375648850429    6,15185225200113e-05    2,37495210052671e-05    1,16038017329438e-05    6,53368676878396e-06    0,00128271813402154 -0,000867362869930264   1,77876918923403e-19    1,62630325872826e-19    -1,89735380184963e-19   1,62630325872826e-19    0   0   -9,07384044746169e-20
-0,229596895430646  0,00578230937666655 0,000201707743336976    4,79768824589291e-05    1,84020572663637e-05    8,96002707181433e-06    5,05525466995835e-06    3,12009781742606e-06    0,000850011219708818    -0,000602925394011745   0   2,71050543121376e-20    -8,13151629364128e-20   5,42101086242752e-20    -5,42101086242752e-20   7,73976355553617e-20
-0,185720949479909  0,00466765632076680 0,000162419592307734    3,85318721641536e-05    1,47407053519860e-05    7,17308297585328e-06    4,02178178072207e-06    2,48428717850195e-06    1,64547815065802e-06    0,000592092919336558    -0,000435974905284452   0   0   8,13151629364128e-20    -1,08420217248550e-19   2,64697796016969e-20
-0,153867987373140  0,00385473267086607 0,000133863548213241    3,17506489004575e-05    1,20962229586152e-05    5,86799087221288e-06    3,28276799988068e-06    2,02338706451671e-06    1,33735029942045e-06    9,34275734555363e-07    0,000428867197061432    -0,000325409609345764   0   2,71050543121376e-20    0   -1,09055491958991e-20
-0,129703518509601  0,00324211947468978 0,000112403568308126    2,65969300905272e-05    1,01402805713936e-05    4,89779294849866e-06    2,73496124917826e-06    1,68586638861081e-06    1,11012300345236e-06    7,73556738632873e-07    5,60933254708493e-07    0,000320553621268105    -0,000249293253625970   5,42101086242752e-20    0   -1,01114558078482e-20
-0,110691345431593  0,00276839969825208 9,59884298624889e-05    2,25961759289096e-05    8,63052307521336e-06    4,15554692230644e-06    2,31688356971108e-06    1,42511604039733e-06    9,39229137057347e-07    6,51934526276135e-07    4,72019315851685e-07    3,53897320062806e-07    0,000245863313382516    -0,000195185934120844   0   -1,24407964127975e-20
-0,0958269169656213 0,00239699666599593 8,28626202960276e-05    1,95227026042985e-05    7,41637441475814e-06    3,57424367962823e-06    1,99334817579930e-06    1,21993241781196e-06    8,05577604288488e-07    5,57554928001086e-07    4,03155267486669e-07    3,01723475812485e-07    2,31838854154289e-07    0,000192695260333710    -0,000155673036807333   -2,34522247271034e-20
-0,0838002301027703 0,00209415237243389 7,23249901251223e-05    1,70229067498473e-05    6,46008752692950e-06    3,11455737751181e-06    1,73159030599080e-06    1,06073213436631e-06    6,96842172109705e-07    4,82764206408816e-07    3,49217230232344e-07    2,60145440758586e-07    2,00286821017368e-07    1,56906945950947e-07    0,000153820426928509    -0,000126146355001072

问题似乎出在您的
gaussjordan
内核中

在原始(
L
)矩阵上执行高斯-乔丹消去法时,可以只对轴心点右侧的行元素进行消去

但是,当您将相同的行操作应用于标识矩阵以创建逆(
I
)时,有必要将等效的行操作应用于行的每个成员,而不仅仅是轴心点右侧的成员

因此,如果您像这样修改
gaussjordan
内核:

 __global__ void gaussjordan(float *A,  float *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i){ // this limits operation to rows below the pivot point
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] -= I[i*n+y]*P;  // apply for every row member
            if(y>=i){ //limits  to row members to the right of the pivot
                A[x*n+y] -= A[i*n+y]*P;  // apply only to members right of pivot
            }
        }
 }
\uuuu全局\uuuu无效gaussjordan(浮点*A,浮点*I,整数n,整数I)
{
intx=blockIdx.x*blockDim.x+threadIdx.x;
int y=blockIdx.y*blockDim.y+threadIdx.y;
浮动P;
如果(x=i){//限制轴右侧的行成员
A[x*n+y]-=A[i*n+y]*P;//仅适用于枢轴右侧的成员
}
}
}

我相信你会有更好的结果。通过上述更改,我能够在
浮动
双精度
的准确度范围内复制您的预期结果。我相信。

什么是“几乎正确的答案”?到底是什么问题?您还没有提供可运行的代码,那么我们怎么知道哪里出了问题呢?输出矩阵中的值的范围是(0.02,0.14),我的结果是(0.01,0.14)。在下面的循环中,只需分配和传输输入数据,在传输结果后再进行反建议:提供一个完整的代码,可以复制、粘贴、编译,而无需添加或更改任何内容,并包括一个简单的测试用例,如3x3矩阵,并显示代码的实际输出以及所需的输出。我更新了我的帖子。谢谢你的帮助!您是否打算为每个块启动一个线程和一个16x16块的网格?因为这就是你写代码的方式。内核中的
\uuu syncthreads()
没有任何作用。记住,threadblocks是以未定义的顺序执行的。非常感谢!我犯了一个多么简单的错误啊。我知道float和double,但cuda gpu不喜欢double(cuda功能1.0)。我相信
gaussjordan
内核不受竞争条件的影响,但我不确定
dev
内核不受竞争条件的影响。对,它通常可能是竞争条件。但是在这种情况下,数组的所有元素都在主对角线下,所以即使d_P[xh+x]的值改变了,也不会影响结果,但是:如果我做了P=d_P[xh+x],然后我将使用P而不是d_P[x*h+x],它会给我一个无竞争条件的代码。我说的对吗?@RobertCrovella OP是如何编译上述代码的。我使用常规的
FILE*fp
fprintf(fp,”)修改了函数
savetofile()
。我们还有别的事要做吗?OP代码中用户定义的标题
device\u launch\u parameters.h
有什么用途?
 __global__ void gaussjordan(float *A,  float *I,int n, int i)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    float P;

    if(x<n && y<n)
        if(x>i){ // this limits operation to rows below the pivot point
            P=A[x*n+i]/A[i*n+i];
            I[x*n+y] -= I[i*n+y]*P;  // apply for every row member
            if(y>=i){ //limits  to row members to the right of the pivot
                A[x*n+y] -= A[i*n+y]*P;  // apply only to members right of pivot
            }
        }
 }