Performance 如何使用openMP提高do循环的性能？_Performance_Loops_Fortran_Openmp_Do Loops

Performance 如何使用openMP提高do循环的性能？

performance loops fortran

Performance 如何使用OpenMP提高do循环的性能？,performance,loops,fortran,openmp,do-loops,Performance,Loops,Fortran,Openmp,Do Loops,如下所示，此代码段用于计算两个数组，即 data_real 和 data_imag。它们的形状都是 1024×10000。我想使用 OpenMP 来加速 DO 循环的计算，但我完全是 OpenMP 的初学者。我不太清楚如何并行化迭代之间存在依赖关系的循环，例如下面代码段中的语句 temp2 = dx * temp1(2*i)，temp3 = dx * temp1(2*i+1)。我的意思是，我不确定这段代码中是否存在竞争条件。有没有办法加速如下所示的 DO 循环？注意：Four1 是用于执行 FFT 的子程序，sinc2 是 sinc 函数的平方。

如下所示，此代码段用于计算两个数组，即

data_real

和

data_imag

。它们的形状都是 1024×10000。我想使用 OpenMP 来加速 DO 循环的计算，但我完全是 OpenMP 的初学者。我不太清楚如何并行化迭代之间存在依赖关系的循环，例如下面代码段中的语句

temp2 = dx * temp1(2*i)，temp3 = dx * temp1(2*i+1)

。我的意思是，我不确定这段代码中是否存在竞争条件。

有没有办法加速如下所示的 DO 循环？

注意：

Four1

是用于执行FFT的子程序，

sinc2

是sinc函数的平方

!Declare variables
! data_complex : input, one column per transform, stored as interleaved
!                (real, imaginary) pairs -> 2*1024 reals per column.
! data_real /
! data_imag    : output, real and imaginary parts of each transformed column
!                after scaling by dx and division by sinc2.
Real, Allocatable, Dimension(:,:) :: data_complex, data_real, data_imag
Real, Dimension(0:2*1024-1) :: temp1        ! per-thread copy of one column (Four1 works in place)
Real, Dimension(0:1024-1)   :: sinc2_tab    ! sinc2( dx*k/2 ) for every i; independent of j
Real :: temp2, temp3

!Allocate
Allocate( data_real(0:1024-1,0:10000-1),                    &
         data_imag(0:1024-1,0:10000-1),                     &
         data_complex(0:2*1024-1,0:10000-1), STAT=istat1 )
! Assigning to an unallocated array below would be invalid, so stop here
! with a clear message instead of failing later in an obscure way.
If ( istat1 /= 0 ) Stop 'Allocation of data_real/data_imag/data_complex failed'

!Initialise
data_real = 0.0
data_imag = 0.0
data_complex = 0.0
dk = 2*3.14159 / 75.0
! NOTE(review): dx is used below but is not assigned anywhere in this
! snippet - it must be set before this point.

! The sinc2 divisor depends only on the row index i, not on the column j,
! so it is evaluated once here (1024 calls) instead of twice per element
! inside the parallel loop (2 * 1024 * 10000 calls). The values are the
! same ones the loop previously computed on the fly.
! NOTE(review): k is not declared in this snippet. If it is an INTEGER
! (e.g. through implicit typing) the assignment below truncates
! dk*Real(i); confirm that k is declared Real.
Do i = 0, 1023
  k = dk * Real(i)
  sinc2_tab(i) = sinc2( dx * k / 2 )      ! sinc2(x) = ( sin(x) / x )**2
End Do

! Each iteration j reads only column j of data_complex and writes only
! column j of data_real / data_imag, and temp1, temp2, temp3 are private,
! so the iterations are independent of each other.
! All iterations do the same amount of work, hence a static schedule
! (dynamic,1 only adds scheduling overhead here).
! num_threads(24) is kept from the original; removing the clause lets the
! OMP_NUM_THREADS environment variable choose the thread count instead.
!$OMP Parallel num_threads(24) private(i,j,temp1,temp2,temp3) &
!$OMP          shared(dx,sinc2_tab,data_complex,data_real,data_imag)
!$OMP Do schedule(static)
  Do j = 0, 10000-1
    temp1(:) = data_complex(:,j)
    Call Four1(temp1, 1024, 1)                          ! in-place FFT of this column
    Do i = 0, 1023
      temp2 = dx * temp1(2*i)                           ! scaled real part
      temp3 = dx * temp1(2*i+1)                         ! scaled imaginary part
      data_real(i,j) = temp2 / sinc2_tab(i)
      data_imag(i,j) = temp3 / sinc2_tab(i)
    End Do
  End Do
!$OMP End Do
!$OMP End Parallel
! --------------------------------------------------------------- !
! ----------------------------------------------------------------!
Subroutine Four1(data_complex, nn, isign)
    !
    ! In-place radix-2 fast Fourier transform of nn complex values.
    !
    ! data_complex : 2*nn reals holding nn complex numbers as interleaved
    !                (real, imaginary) pairs; overwritten with the transform.
    ! nn           : number of complex values. Expected to be a power of
    !                two; this is NOT checked.
    ! isign        : sign of the exponent. +1 uses exp(+i*2*pi*j*k/nn),
    !                -1 uses the complex conjugate. No 1/nn scaling is
    !                applied in either direction.
    !
    ! The routine touches only its arguments and local variables.
    !
    Implicit None

    Integer, Intent(in) :: nn
    Integer, Intent(in) :: isign
    Real, Intent(inout), Dimension(2*nn) :: data_complex
    Integer :: i, istep, j, m, mmax, n
    Real :: tempi, tempr
    Real(8) :: theta, wi, wpi, wpr, wr, wtemp
    ! 2*pi is defined locally so the routine does not depend on a 'pi'
    ! supplied by the surrounding scope (or, worse, on an implicitly typed,
    ! uninitialised variable of that name).
    Real(8), Parameter :: twopi = 6.283185307179586d0
    ! ---------------------------------------------------------
    n=2*nn

    ! --- Stage 1: bit-reversal permutation of the complex pairs ---
    j=1
    Do i=1,n,2
      If(j>i) then
        ! swap complex element at i with the one at j
        tempr=data_complex(j)
        tempi=data_complex(j+1)
        data_complex(j)=data_complex(i)
        data_complex(j+1)=data_complex(i+1)
        data_complex(i)=tempr
        data_complex(i+1)=tempi
      endif
      m=n/2

      ! advance j to the bit-reversed successor
      Do while ( (m>=2).and.(j>m) )
        j=j-m
        m=m/2
      End do

      j=j+m
    EndDo

    ! --- Stage 2: butterfly passes, sub-transform length doubles each pass ---
    mmax=2
    Do while ( n > mmax )
       istep=2*mmax
       theta=twopi/(isign*mmax)
       ! wpr + i*wpi = exp(i*theta) - 1; used to step the twiddle factor
       ! (wr, wi) by recurrence in double precision.
       wpr=-2.0d0*sin(0.5d0*theta)**2
       wpi=sin(theta)
       wr=1.0d0
       wi=0.0d0
       Do m=1,mmax,2
         Do i=m,n,istep
           j=i+mmax
           ! temp = w * data(j)
           tempr=Real(wr)*data_complex(j)-Real(wi)*data_complex(j+1)
           tempi=Real(wr)*data_complex(j+1)+Real(wi)*data_complex(j)
           data_complex(j)=data_complex(i)-tempr
           data_complex(j+1)=data_complex(i+1)-tempi
           data_complex(i)=data_complex(i)+tempr
           data_complex(i+1)=data_complex(i+1)+tempi
         End Do
         ! w = w * exp(i*theta); wtemp keeps the old wr for the wi update
         wtemp=wr
         wr=wr*wpr-wi*wpi+wr
         wi=wi*wpr+wtemp*wpi+wi
       End Do
       mmax=istep
     End Do

  End Subroutine Four1
  ! ------------------------------------------------------------ !
  ! ------------------------------------------------------------ !

  Function sinc2 ( x ) Result( sq )
    !
    ! Squared sinc function: ( sin(x) / x )**2, with the value 1.0 returned
    ! for arguments whose magnitude is below 1.e-16.
    !
    Real, Intent(in) :: x
    Real :: sq

    ! Guard clause: near zero the quotient sin(x)/x is replaced by its
    ! limit (1) instead of being evaluated, which also avoids 0/0 at x = 0.
    ! A magnitude test is used rather than an exact comparison with 0.0.
    If ( abs(x) < 1.e-16 ) then
      sq = 1.0
      Return
    End If

    sq = ( sin(x)/x )**2

  End Function sinc2

!Declare variables
Real, Allocatable, Dimension(:,:) :: data_complex, data_real, data_imag
Real, Dimension(0:2*1024-1) :: temp1
Real :: temp2, temp3
!Allocate
Allocate( data_real(0:1024-1,0:10000-1),                    &
         data_imag(0:1024-1,0:10000-1),                     &
         data_complex(0:2*1024-1,0:10000-1), STAT=istat1 )
!Initialise
data_real = 0.0
data_imag = 0.0
data_complex = 0.0
dk = 2*3.14159 / 75.0
!$OMP Parallel num_threads(24) private(i,j,k,temp1,temp2,temp3) shared( dk)
!$OMP Do schedule(dynamic,1)
  Do j = 0, 10000-1
    temp1(:) = data_complex(:,j)
    Call Four1(temp1, 1024, 1)                          ! Calling the subroutine 'Four1' to
                                                        ! perform Fast Fourier Transform
    Do i = 0, 1023
      k = dk * Real(i)
      temp2 = dx * temp1(2*i)
      temp3 = dx * temp1(2*i+1)
      data_real(i,j) = temp2 / sinc2( dx * k / 2 )      ! sinc2(x) = sin(x) / x
      data_imag(i,j) = temp3 / sinc2( dx * k / 2 )
    End Do
  End Do
!$OMP End Do nowait
!$OMP End Parallel
! --------------------------------------------------------------- !
! ----------------------------------------------------------------!
Subroutine Four1(data_complex, nn, isign)
    Integer, Intent(in) :: nn
    Integer, Intent(in) :: isign
    Real, Intent(inout), Dimension(2*nn) :: data_complex
    Integer :: i, istep, j, m, mmax, n
    Real :: tempi, tempr
    Real(8) :: theta, wi, wpi, wpr, wr, wtemp
    ! ---------------------------------------------------------
    n=2*nn
    j=1
    Do i=1,n,2
      If(j>i) then
        tempr=data_complex(j)
        tempi=data_complex(j+1)
        data_complex(j)=data_complex(i)
        data_complex(j+1)=data_complex(i+1)
        data_complex(i)=tempr
        data_complex(i+1)=tempi
      endif
      m=n/2
      Do while ( (m>=2).and.(j>m) )
        j=j-m
        m=m/2
      End do
      j=j+m
    EndDo
    mmax=2
    Do while ( n > mmax )
       istep=2*mmax
       theta=(2*pi)/(isign*mmax)
       wpr=-2.0d0*sin(0.5d0*theta)**2
       wpi=sin(theta)
       wr=1.0d0
       wi=0.0d0
       Do m=1,mmax,2
         Do i=m,n,istep
           j=i+mmax
           tempr=Real(wr)*data_complex(j)-Real(wi)*data_complex(j+1)
           tempi=Real(wr)*data_complex(j+1)+Real(wi)*data_complex(j)
           data_complex(j)=data_complex(i)-tempr
           data_complex(j+1)=data_complex(i+1)-tempi
           data_complex(i)=data_complex(i)+tempr
           data_complex(i+1)=data_complex(i+1)+tempi
         End Do
         wtemp=wr
         wr=wr*wpr-wi*wpi+wr
         wi=wi*wpr+wtemp*wpi+wi
       End Do
       mmax=istep
     End Do
  End Subroutine Four1
! ------------------------------------------------------------ !
! ------------------------------------------------------------ !
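  Real Function sinc2 ( x )
    !
    ! Define the square of sinc function
    !
    Real, Intent(in) :: x
    If ( abs(x) < 1.e-16 ) then
    ! be careful with comparison to real numbers because of rounding errors
    ! better: if (abs(x).lt.1.e-16) then sinc=1.
      sinc2 = 1.0
    Else
      sinc2 = ( sin(x)/x )**2
    End If
  End Function sinc2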

您可以使用 sections 或 tasks，在其他计算并行执行的同时等待相关计算完成。@Isma 您能提供更多细节吗？请对所有 Fortran 问题使用 [fortran] 标签，以引起 Fortran 专家的更多注意；你的问题并不特定于旧的 Fortran 95。有几个建议我无法测试，因为你没有提供完整的程序：首先，我会使用一个好的 FFT 库（例如 FFTW），而不是我猜测你正在使用的 Numerical Recipes（NR）例程，这可能会有所帮助。其次，你的 OpenMP 用法看起来没有问题（如果不使用库，我的写法也会与此相近），但你确定你的硬件能有效地支持 24 个线程吗？我不会把线程数写死，而是使用环境变量 OMP_NUM_THREADS，观察运行时间如何随其取值变化。@IanBush 我的机器有 24 个内核。