Parallel processing 用高斯消去法求解对称线性方程ax=b_Parallel Processing_Fortran_Openmp

Parallel processing 用高斯消去法求解对称线性方程ax=b

parallel-processing fortran

Parallel processing 用高斯消去法求解对称线性方程ax=b,parallel-processing,fortran,openmp,Parallel Processing,Fortran,Openmp,我是并行编程新手，目前正致力于优化与电磁计算相关的代码。通过分析程序的工作原理，我意识到85%的执行时间都花在解线性方程上。我研究了一点openmp，但我不知道如何并行化这样的嵌套循环。有什么想法吗？先谢谢你。遵循下面的代码 Subroutine GaussEqSolver_Sym(n,ma,a,b,ep,kwji) !------------------------------------------------------------------ ! Solve sysmm

我是并行编程新手，目前正致力于优化与电磁计算相关的代码。通过分析程序的工作原理，我意识到85%的执行时间都花在解线性方程上。我研究了一点openmp，但我不知道如何并行化这样的嵌套循环。有什么想法吗？先谢谢你。遵循下面的代码

    Subroutine GaussEqSolver_Sym(n,ma,a,b,ep,kwji)
!------------------------------------------------------------------
!  Solve sysmmetric linear equation ax=b by using Gauss elimination.
!  If kwji=1, no solution;if kwji=0,has solution
!  Input--n,ma,a(ma,n),b(n),ep,
!  Output--b,kwji
!------------------------------------------------------------------
   

        implicit real*8 (a-h,o-z)
           dimension a(ma,n),b(n),m(n+1)
           do 10 i=1,n
    10     m(i)=i
           do 120 k=1,n
              p=0.0
              do 20 i=k,n
                 do 20 j=k,n
                    if(dabs(a(i,j)).gt.dabs(p)) then
                       p=a(i,j)
                       io=i
                       jo=j
                    endif
    20        continue
              if(dabs(p)-ep) 30,30,35
    30        kwji=1
              return
    35        continue
              if(jo.eq.k) go to 45
              do 40 i=1,n
                 t=a(i,jo)
                 a(i,jo)=a(i,k)
                 a(i,k)=t
    40        continue
              j=m(k)
              m(k)=m(jo)
              m(jo)=j
    45        if(io.eq.k) go to 55
              do 50 j=k,n
                 t=a(io,j)
                 a(io,j)=a(k,j)
                 a(k,j)=t
    50        continue
              t=b(io)
              b(io)=b(k)
              b(k)=t
    55        p=1./p
              in=n-1
              if(k.eq.n) go to 65
              do 60 j=k,in
    60        a(k,j+1)=a(k,j+1)*p
    65        b(k)=b(k)*p
              if(k.eq.n) go to 120
              do 80 i=k,in
                 do 70 j=k,in
    70              a(i+1,j+1)=a(i+1,j+1)-a(i+1,k)*a(k,j+1)
    80              b(i+1)=b(i+1)-a(i+1,k)*b(k)
    120       continue
              do 130 i1=2,n
                 i=n+1-i1
                 do 130 j=i,in
    130       b(i)=b(i)-a(i,j+1)*b(j+1)
              do 140 k=1,n
                 i=m(k)
    140       a(1,i)=b(k)
              do 150 k=1,n
    150       b(k)=a(1,k)
              kwji=0
        return
        end

如果您对性能感兴趣，您应该使用LAPACK。为了说明这一点，我编写了一个简单的驱动程序，用于比较调用DSYSV时提供的代码的速度。DSYSV是一个LAPACK例程，用于求解对称“双精度”矩阵的一组线性方程组。代码和结果如下所示，但总而言之，LAPACK的速度是Fortran的3.3倍，是Fortran的725倍。请注意，这可能不是一个优化的LAPACK库，而是我在笔记本电脑上安装的Mint Linux附带的任何库。无论如何，详情如下

ian@eris:~/work/stack$ cat solve.f90
    Subroutine GaussEqSolver_Sym(n,ma,a,b,ep,kwji)
!------------------------------------------------------------------
!  Solve sysmmetric linear equation ax=b by using Gauss elimination.
!  If kwji=1, no solution;if kwji=0,has solution
!  Input--n,ma,a(ma,n),b(n),ep,
!  Output--b,kwji
!------------------------------------------------------------------
   

        implicit real*8 (a-h,o-z)
           dimension a(ma,n),b(n),m(n+1)
           do 10 i=1,n
    10     m(i)=i
           do 120 k=1,n
              p=0.0
              do 20 i=k,n
                 do 20 j=k,n
                    if(dabs(a(i,j)).gt.dabs(p)) then
                       p=a(i,j)
                       io=i
                       jo=j
                    endif
    20        continue
              if(dabs(p)-ep) 30,30,35
    30        kwji=1
              return
    35        continue
              if(jo.eq.k) go to 45
              do 40 i=1,n
                 t=a(i,jo)
                 a(i,jo)=a(i,k)
                 a(i,k)=t
    40        continue
              j=m(k)
              m(k)=m(jo)
              m(jo)=j
    45        if(io.eq.k) go to 55
              do 50 j=k,n
                 t=a(io,j)
                 a(io,j)=a(k,j)
                 a(k,j)=t
    50        continue
              t=b(io)
              b(io)=b(k)
              b(k)=t
    55        p=1./p
              in=n-1
              if(k.eq.n) go to 65
              do 60 j=k,in
    60        a(k,j+1)=a(k,j+1)*p
    65        b(k)=b(k)*p
              if(k.eq.n) go to 120
              do 80 i=k,in
                 do 70 j=k,in
    70              a(i+1,j+1)=a(i+1,j+1)-a(i+1,k)*a(k,j+1)
    80              b(i+1)=b(i+1)-a(i+1,k)*b(k)
    120       continue
              do 130 i1=2,n
                 i=n+1-i1
                 do 130 j=i,in
    130       b(i)=b(i)-a(i,j+1)*b(j+1)
              do 140 k=1,n
                 i=m(k)
    140       a(1,i)=b(k)
              do 150 k=1,n
    150       b(k)=a(1,k)
              kwji=0
        return
     end 

Program solve_eqns

  Use, Intrinsic :: iso_fortran_env, Only :  wp => real64, li => int64

  Implicit None

  Real( wp ), Dimension( :, : ), Allocatable :: a, a_copy

  Real( wp ), Dimension( : ), Allocatable :: b
  Real( wp ), Dimension( : ), Allocatable :: x_lap, x_for
  Real( wp ), Dimension( : ), Allocatable :: work

  Real( wp ) :: time_lap, time_for
  
  Integer, Dimension( : ), Allocatable :: pivots

  Integer :: i
  Integer :: n, nb = 64 ! hack value for nb - should use ilaenv
  Integer :: error

  Integer( li ) :: start, finish, rate

  Write( *, * ) 'n ?'
  Read ( *, * )  n

  Allocate( a( 1:n, 1:n ) )
  Allocate( b( 1:n ) )
  Allocate( pivots( 1:n ) )

  ! Set up matrix
  Call Random_number( a )
  a = a - 0.5_wp
  ! Make A symmetric
  a = 0.5_wp * ( a + Transpose( a ) )
  ! Add n to diagonal of A to avoid any nasty condition numbers
  Do i = 1, n
     a( i, i ) = a( i, i ) + n
  End Do
  ! And keep a back up of A
  a_copy = a

  ! RHS
  Call Random_number( b )

  ! Solve with LAPACK
  x_lap = b
  Allocate( work( 1:n * nb ) )
  Call system_clock( start, rate )
  Call dsysv( 'U', n, 1, a, Size( a, dim = 1 ), pivots, &
       x_lap, Size( x_lap, Dim = 1 ), work, Size( work ), error )
  Call system_clock( finish, rate )
  time_lap = Real( finish - start, Kind( time_lap ) ) / rate
  ! Restore A
  a = a_copy
  Write( *, * ) 'Errors for LAPACK', error, Maxval( Abs( Matmul( a, x_lap ) - b ) )
  Write( *, * ) 'Time for LAPACK', time_lap

  ! Solve with Fortran  
  x_for = b
  Call system_clock( start, rate )
  Call GaussEqSolver_Sym( Size( a, Dim = 2 ), Size( a, Dim = 1 ), a, x_for, Epsilon( a ), error )
  Call system_clock( finish, rate )
  time_for = Real( finish - start, Kind( time_for ) ) / rate
  ! Restore A
  a = a_copy
  Write( *, * ) 'Errors for Fortran', error, Maxval( Abs( Matmul( a, x_for ) - b ) )
  Write( *, * ) 'Time_For for Fortran', time_for

  Write( *, * ) 'Max difference in solutions', Maxval( Abs( x_lap - x_for ) )

  Write( *, * ) 'LAPACK is ', time_for / time_lap, ' times quicker than the Fortran'
  
       
End Program solve_eqns
ian@eris:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

ian@eris:~/work/stack$ gfortran -O3 solve.f90  -llapack
ian@eris:~/work/stack$ ./a.out
 n ?
100
 Errors for LAPACK           0   4.4408920985006262E-016
 Time for LAPACK   1.5952670000000000E-003
 Errors for Fortran           0   9.9920072216264089E-016
 Time_For for Fortran   5.3095140000000004E-003
 Max difference in solutions   8.6736173798840355E-018
 LAPACK is    3.3282917530419676       times quicker than the Fortran
ian@eris:~/work/stack$ ./a.out
 n ?
1000
 Errors for LAPACK           0   1.3322676295501878E-015
 Time for LAPACK   3.9014976000000000E-002
 Errors for Fortran           0   4.9960036108132044E-015
 Time_For for Fortran   1.9314730620000000     
 Max difference in solutions   4.7704895589362195E-018
 LAPACK is    49.505940026722044       times quicker than the Fortran
ian@eris:~/work/stack$ ./a.out
 n ?
5000 
 Errors for LAPACK           0   4.3298697960381105E-015
 Time for LAPACK   1.2611959250000000     
 Errors for Fortran           0   1.3322676295501878E-014
 Time_For for Fortran   913.76959534100001     
 Max difference in solutions   2.7647155398380363E-018
 LAPACK is    724.52628273517462       times quicker than the Fortran

正如我在对你以前的线性代数问题的评论中所说的，现在你不需要自己编写代码，你需要使用一个很好的LAPACK实现，比如MKL。它将比您自己编写的任何东西都更快、更稳定，并且将与线程并行。假设你有一个通用的矩阵DGESV可能是你想要的例程。虽然我同意@IanBush的观点，对于高效的并行代码，你最好使用一个现成的库（更好），但它不会教你太多关于并行代码的知识，这可能是一个很好的目标。然而，正如我对这个问题的早期版本所做的评论，我认为在尝试并行化之前，您需要对代码进行现代化。取消使用

continue

和

goto

不仅仅是一种现代主义时尚，它会使代码在尝试并行化时更加清晰，并且需要您更彻底地理解代码。哦，放弃隐式键入并使用

implicit none

——这将保护您在重构过程中避免一些严重的潜在错误。（如果我们讨论要放弃的内容，算术If将在列表中占据相当高的位置。）不幸的是，我只能向@francescalus投票一次。。。