Fortran 为什么通过“use module”语句使用数组的子例程，比使用本地定义大小数组的相同子例程性能更快？_Fortran_Gfortran

Fortran 为什么通过“use module”语句使用数组的子例程，比使用本地定义大小数组的相同子例程性能更快？

fortran

Fortran 为什么一个带有数组的子例程；使用模块“；语句比相同的子例程（本地大小的数组）提供更快的性能？,fortran,gfortran,Fortran,Gfortran,与此相关，但我相信这个例子更清楚地说明了问题我有一些遗留代码如下所示： subroutine ID_OG(N, DETERM) use variables, only: ID implicit real (A-H,O-Z) implicit integer(I-N) DETERM = 1.0 DO 1 I=1,N 1 ID(I)=0 DETERM = sum(ID) end subroutine ID_OG 将仅使用变量：ID替换为实数，维度（N）：：I

与此相关，但我相信这个例子更清楚地说明了问题

我有一些遗留代码如下所示：

! Legacy-style kernel: zeroes the first N elements of the module array ID
! and returns the sum of the whole array in DETERM.
! Typing is implicit: N and the loop index are integers, DETERM is real.
subroutine ID_OG(N, DETERM)
  use variables, only: ID
  implicit real (a-h,o-z)
  implicit integer(i-n)

  ! This value is overwritten by the reduction at the end of the routine.
  DETERM = 1.0
  do k = 1, N
    ID(k) = 0
  end do
  DETERM = sum(ID)
end subroutine ID_OG

将

use variables, only: ID

替换为

real, dimension(N) :: ID

或

real, dimension(:), allocatable :: ID

会导致明显的性能损失。为什么会这样？这是预期的行为吗？我想知道这是否是因为程序需要反复为本地数组

ID

分配内存，而

use

语句让程序可以跳过内存分配这一步。

在传统代码中，

ID

位于

module variables

中，但它仅在子例程

ID_OG

中使用。它不在代码中的任何其他地方使用——它既不是输入也不是输出。在我看来，从

module variables

中删除

ID

并在子例程中本地定义它，似乎是一种很好的编程实践。但也许情况并非如此。

最小工作示例（MWE）：使用 gfortran 8.2.0 编译，命令为 gfortran -O3 test.f95

! Holds the shared work array used by the timing experiment.
! ID is a rank-1 real allocatable; this module never allocates it itself.
module variables
  implicit none

  real, allocatable :: ID(:)

end module variables


! Benchmark driver. For every array size N = 1..n_max it times loop_max calls
! to each of the five ID_* subroutine variants with cpu_time, prints N and the
! last DETERM after each batch, and writes one row of timings per N to
! TimingSubroutines_ID.txt.
program test
  use variables, only: ID

  implicit none

  ! Named constants instead of an initialised variable: an initialiser on a
  ! plain variable implies SAVE, and 1e6 is a real literal, not an integer.
  integer, parameter  :: loop_max = 1000000    ! calls per timing sample
  integer, parameter  :: n_max = 100           ! largest array size timed
  integer, parameter  :: out_unit = 1          ! unit of the results file

  integer             :: N
  integer             :: ii                    ! loop index
  real                :: DETERM

  real :: t1, t2
  real :: t_ID_OG, t_ID_header, t_ID_no_ID, t_OG_no_ID, t_allocate

  character(*), parameter :: format_header = '((A5, 1X), 20(A12,1X))'
  character(*), parameter :: format_data = '((I5, 1X), 20(ES12.5, 1X))'

  open(out_unit, file = 'TimingSubroutines_ID.txt', status = 'unknown')
  write(out_unit,format_header) 'N', 't_Legacy', 't_header', 't_head_No_ID', 't_Leg_no_ID', &
                            & 't_allocate'

  do N = 1, n_max

    ! The module array is sized here, once per N, for the variants that
    ! access ID through "use variables".
    allocate(ID(N))


    ! Variant 1: implicit typing, module array.
    call cpu_time(t1)
    do ii = 1, loop_max
      call ID_OG(N, DETERM)
    end do
    call cpu_time(t2)
    t_ID_OG = t2 - t1
    print*, N, DETERM


    ! Variant 2: explicit declarations, module array.
    call cpu_time(t1)
    do ii = 1, loop_max
      call ID_header(N, DETERM)
    end do
    call cpu_time(t2)
    t_ID_header = t2 - t1
    print*, N, DETERM


    ! Variant 3: explicit declarations, local array dimensioned by N.
    call cpu_time(t1)
    do ii = 1, loop_max
      call ID_header_no_ID(N, DETERM)
    end do
    call cpu_time(t2)
    t_ID_no_ID = t2 - t1
    print*, N, DETERM


    ! Variant 4: implicit typing, local array dimensioned by N.
    call cpu_time(t1)
    do ii = 1, loop_max
      call ID_OG_no_ID(N, DETERM)
    end do
    call cpu_time(t2)
    t_OG_no_ID = t2 - t1
    print*, N, DETERM


    ! Variant 5: implicit typing, local allocatable array.
    call cpu_time(t1)
    do ii = 1, loop_max
      call ID_OG_allocate(N, DETERM)
    end do
    call cpu_time(t2)
    t_allocate = t2 - t1
    print*, N, DETERM


    deallocate(ID)
    write(out_unit,format_data) N, t_ID_OG, t_ID_header, t_ID_no_ID, t_OG_no_ID, t_allocate

  end do

  ! Close the results file explicitly rather than relying on program exit.
  close(out_unit)

end program test


! Benchmark variant "legacy": implicit typing, array taken from module
! variables. Clears ID(1:N) element by element, then stores sum(ID) in DETERM.
subroutine ID_OG(N, DETERM)
  use variables, only: ID
  implicit real (a-h,o-z)
  implicit integer(i-n)

  ! Overwritten below by the reduction over ID.
  DETERM = 1.0

  do k = 1, N
    ID(k) = 0
  end do

  DETERM = sum(ID)

end subroutine ID_OG



! Benchmark variant "header": same work as the legacy routine but with
! explicit declarations. Clears the first N elements of the module array ID
! and returns sum(ID).
!   N      - number of leading elements of ID to clear
!   DETERM - on return, the sum of all elements of ID
subroutine ID_header(N, DETERM)
  use variables, only: ID
  implicit none

  integer, intent(in)  :: N
  real,    intent(out) :: DETERM
  integer              :: idx

  ! Overwritten below by the reduction over ID.
  DETERM = 1.0

  do idx = 1, N
    ID(idx) = 0
  end do

  DETERM = sum(ID)

end subroutine ID_header



! Benchmark variant with explicit declarations and a local work array.
! ID is a local array whose extent is the dummy argument N; it is cleared
! element by element and its sum is returned.
!   N      - extent of the local array
!   DETERM - on return, the sum of the cleared array
subroutine ID_header_no_ID(N, DETERM)
  implicit none

  integer, intent(in)  :: N
  real,    intent(out) :: DETERM
  real                 :: ID(N)
  integer              :: idx

  ! Overwritten below by the reduction over ID.
  DETERM = 1.0

  do idx = 1, N
    ID(idx) = 0
  end do

  DETERM = sum(ID)

end subroutine ID_header_no_ID


! Benchmark variant with implicit typing and a local work array.
! ID is a local array whose extent is the dummy argument N; it is cleared
! element by element and its sum is returned in DETERM.
! Typing is implicit: N and the loop index are integers, DETERM is real.
subroutine ID_OG_no_ID(N, DETERM)
  implicit real (a-h,o-z)
  implicit integer(i-n)
  real :: ID(N)

  ! Overwritten below by the reduction over ID.
  DETERM = 1.0

  do k = 1, N
    ID(k) = 0
  end do

  DETERM = sum(ID)

end subroutine ID_OG_no_ID


! Benchmark variant with implicit typing and a local allocatable work array.
! ID is allocated with N elements on every call, cleared element by element,
! and its sum is returned in DETERM. The routine has no explicit deallocate.
! Typing is implicit: N and the loop index are integers, DETERM is real.
subroutine ID_OG_allocate(N, DETERM)
  implicit real (a-h,o-z)
  implicit integer(i-n)
  real, allocatable :: ID(:)

  allocate(ID(N))

  ! Overwritten below by the reduction over ID.
  DETERM = 1.0

  do k = 1, N
    ID(k) = 0
  end do

  DETERM = sum(ID)

end subroutine ID_OG_allocate

分配数组需要时间。编译器可以自行决定在哪里分配本地数组，但通常可以通过编译器特定的标志进行调整。对于 gfortran，可使用

-fstack-arrays

强制将本地数组放在栈上。

在堆栈上分配只是更改堆栈指针，实际上是免费的。然而，在堆上分配更复杂，需要一些簿记

有些情况下适合使用局部变量，有些情况下适合使用全局（模块）变量。还可以使用带 save 属性的局部变量，或者作为某个对象组件的变量。如果看不到相关代码的完整设计，就不能说哪一种更好。

FWIW，使用

-fstack-arrays

时，除了使用

allocate() 显式分配的情况外，我看不出有什么区别：

显式 allocate
将始终使用堆。
如果不使用 -fstack-arrays
，我确实看到了一些区别：

因为我的笔记本同时运行许多进程，所以图表非常嘈杂

这并不是说人们应该总是使用 -fstack-arrays
，我只是用它来演示这一区别。该选项很有用，但必须注意避免堆栈溢出错误。-fmax-stack-var-size 可能会有所帮助。
分配数组需要时间。编译器可以自行决定在哪里分配本地数组，但通常可以通过编译器特定的标志进行调整。对于 gfortran，可使用 -fstack-arrays
强制将本地数组放在栈上。
在堆栈上分配只是更改堆栈指针，实际上是免费的。然而，在堆上分配更复杂，需要一些簿记
有些情况下适合使用局部变量，有些情况下适合使用全局（模块）变量。还可以使用带 save 属性的局部变量，或者作为某个对象组件的变量。如果看不到相关代码的完整设计，就不能说哪一种更好。
FWIW，使用 -fstack-arrays
时，除了使用 allocate() 显式分配的情况外，我看不出有什么区别：

显式 allocate
将始终使用堆。
如果不使用 -fstack-arrays
，我确实看到了一些区别：

因为我的笔记本同时运行许多进程，所以图表非常嘈杂

这并不是说人们应该总是使用 -fstack-arrays
，我只是用它来演示这一区别。该选项很有用，但必须注意避免堆栈溢出错误。-fmax-stack-var-size
可能对此有所帮助。
正如您的测试所指出的，所有不使用模块
变量的方法的额外开销，源于该语言不断演进形成的设计理念：不让用户为过多的内存管理细节操心。
编译器将决定应该在哪里分配内存，除非您开始修改编译器标志。您认为分配/释放时间是一个缺点，但您的分析也表明：

堆栈与堆内存处理开销很快变得越来越小：对于N>=100
，正如您的测试所指出的，所有不使用模块
变量的方法的额外开销，都源于该语言不断演进形成的设计理念：不让用户为过多的内存管理细节操心。
编译器将决定应该在哪里分配内存，除非您开始修改编译器标志。您认为分配/释放时间是一个缺点，但您的分析也表明：

堆栈与堆内存的处理开销很快变得越来越小：对于N>=100
，这已经很有用了，谢谢。我也看到 use variables
和 real, dimension(N) :: ID
之间有类似的性能。我注意到，您的图只有 4 组数据，而我的图有第 5 组数据，使用的是 real, dimension(:), allocatable :: ID
。我注意到这个版本在使用标志 -fstack-arrays
时性能仍然较差。这也是您看到的吗？@NickBrady 是的，如果您显式分配，它将始终进入堆，没有其他方法。理论上，编译器可以做一些生存期静态分析，但不太可能实现。这非常有用，谢谢。我也看到 use variables
和 real, dimension(N) :: ID
之间有类似的性能。我注意到，您的图只有 4 组数据，而我的图有第 5 组数据，使用的是 real, dimension(:), allocatable :: ID。我注意到
! Sketch of an object that owns its work array: the array is allocated the
! first time compute is called on an object and reused on later calls.
module myCalculation
    implicit none

    type, public :: fancyMethod
        integer :: N = 0              ! size passed to the most recent init
        real, allocatable :: ID(:)    ! work array owned by the object

    contains

        procedure :: init
        procedure :: compute
        procedure :: is_init

    end type fancyMethod

contains

    ! Give self a zero-filled work array of n elements and record n.
    ! Any array self already held is replaced.
    elemental subroutine init(self,n)
        class(fancyMethod), intent(inout) :: self
        integer, intent(in) :: n
        real, allocatable :: tmp(:)

        self%N = n
        allocate(tmp(n))
        tmp(:) = 0
        call move_alloc(from=tmp,to=self%ID)
    end subroutine init

    ! Return .true. when the work array is allocated and has at least one
    ! element.
    elemental logical function is_init(self)
        class(fancyMethod), intent(in) :: self

        ! Fortran does not guarantee short-circuit evaluation of .and., so
        ! size() is only queried once the array is known to be allocated.
        is_init = .false.
        if (allocated(self%ID)) is_init = size(self%ID) > 0
    end function is_init

    ! Return the sum of the first n elements of the work array, allocating
    ! it through init when the object has not been initialised yet.
    ! NOTE: the original sketch elided further dummy arguments here.
    real function compute(self,n) result(DETERM)
        class(fancyMethod), intent(inout) :: self
        integer, intent(in) :: n
        ! NOTE: the original sketch elided the actual computation here.

        if (.not.is_init(self)) call init(self,n)

        DETERM = sum(self%ID(1:n))
    end function compute

end module myCalculation

! Sketch: work on a local array with a fixed extent of MAX_SIZE elements and
! return the sum of its first N entries. The [...] placeholder stands for
! code elided in the original; presumably it fills ID -- confirm.
real function computeWithMaxSize(N) result(DETERM)
    integer, intent(in) :: N
    ! Fixed extent of the local work array.
    integer, parameter :: MAX_SIZE = 1024
    real :: ID(MAX_SIZE)

    [...]

    ! Stop the program when N exceeds the fixed extent of ID.
    if (N>MAX_SIZE) stop ' N is too large! '

    DETERM = sum(ID(1:N))
end function computeWithMaxSize

! Sketch: allocate a local work array of N elements on every call and return
! the sum of its first N entries. The [...] placeholder stands for code
! elided in the original; presumably it fills ID -- confirm.
real function computeWithAllocatable(N) result(DETERM)
    integer, intent(in) :: N
    real, allocatable :: ID(:)

    ! Sized by the argument, so there is no fixed upper limit on N here.
    allocate(ID(N))
 
    [...]

    DETERM = sum(ID(1:N))
end function computeWithAllocatable