Warning: file_get_contents(/data/phpspider/zhask/data//catemap/3/sockets/2.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Parallel processing 在Julia中使用并行处理填充矩阵_Parallel Processing_Julia_Dynamic Programming - Fatal编程技术网

Parallel processing 在Julia中使用并行处理填充矩阵

Parallel processing 在Julia中使用并行处理填充矩阵,parallel-processing,julia,dynamic-programming,Parallel Processing,Julia,Dynamic Programming,我试图通过并行处理来加快Julia(v.0.5.0)中动态规划问题的求解速度。该问题涉及在每次迭代中为1073 x 19矩阵的每个元素选择最佳值,直到连续矩阵差落在公差范围内。我认为,在每次迭代中,为矩阵的每个元素填充值都可以并行化。然而,我发现使用SharedArray会导致性能大幅下降,我想知道是否有更好的方法来解决这个问题 我为下面的函数构造参数: est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.719

我试图通过并行处理来加快Julia(v.0.5.0)中动态规划问题的求解速度。该问题涉及在每次迭代中为1073 x 19矩阵的每个元素选择最佳值,直到连续矩阵差落在公差范围内。我认为,在每次迭代中,为矩阵的每个元素填充值都可以并行化。然而,我发现使用
SharedArray
会导致性能大幅下降,我想知道是否有更好的方法来解决这个问题

我为下面的函数构造参数:

    # Estimated parameters; they are unpacked by position a few lines below.
    est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]

    # Values that are fixed here rather than read from est_params.
    r     = 0.015
    tau   = 0.35
    delta = 0.15

    # Positional unpacking: entries 1 through 10 of est_params, in this order.
    rho, sigma, gamma, a_capital, lambda1, lambda2, s, theta, mu, p_bar_k_ss = est_params

    # Reciprocal of the gross rate 1 + r.
    beta        = (1+r)^(-1)
    sigma_range = 4

    # Number of grid points for z, p and k respectively.
    gz, gp, gk = 19, 29, 37

    # lnz: gz evenly spaced points on [-sigma_range*sigma, sigma_range*sigma];
    # z is its element-wise exponential.
    lnz = collect(linspace(-sigma_range*sigma, sigma_range*sigma, gz))
    z   = map(exp, lnz)

    # k grid: gk_m points below k_ss, k_ss itself, and gk_m points above it,
    # spaced by powers of (1 - delta), then sorted ascending.
    gk_m = fld(gk, 2)
    # Need to add mu somewhere to k_ss
    k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
    k = vcat([k_ss*((1-delta)^i) for i in 1:gk_m],
             k_ss,
             [k_ss/((1-delta)^i) for i in 1:gk_m])
    sort!(k)

    # p grid: gp evenly spaced points on [-p_bar/2, p_bar].
    p_bar = p_bar_k_ss*k_ss
    p     = collect(linspace(-p_bar/2, p_bar, gp))

    #Tauchen
    # Builds Zprob (N x N, Float32) from the nodes Z and the parameters mu, rho, sigma,
    # then Zcumprob as its running sum along each row.
    N     = length(z)
    Zprob = zeros(Float32,N,N)

    # Z: N evenly spaced nodes running from lnz[1] to lnz[N].
    Z     = zeros(N,1)
    Z[1]  = lnz[1]
    Z[N]  = lnz[length(z)]
    zstep = (Z[N] - Z[1]) / (N - 1)
    for i=2:(N-1)
        Z[i] = Z[1] + zstep * (i - 1)
    end

    for a = 1:N, b = 1:N
        # 0.5*erfc(-x/sqrt(2)) evaluated at the upper and lower edge of the cell
        # of width zstep centred on Z[b], shifted by mu + rho*Z[a] and scaled by sigma.
        upper = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
        lower = 0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
        # The first column keeps everything below its upper edge, the last column
        # everything above its lower edge, interior columns the mass in between.
        Zprob[a,b] = b == 1 ? upper : (b == N ? 1 - lower : upper - lower)
    end

    # An earlier variant obtained Zprob from a `tauchen(gz, rho, sigma, mu, sigma_range)`
    # helper (second element of its result); the loop above replaces that call.
    # The 2 selects the 2nd dimension, i.e. the sum runs across columns.
    Zcumprob = cumsum(Zprob, 2)


    gm = gk * gp

    # control lists every (k, p) pair: column 1 holds k[i] repeated gp times per block,
    # column 2 holds the whole p grid inside each block. endog is an independent copy.
    control=zeros(gm,2)
    for i=1:gk
        rows = (1+gp*(i-1)):(gp*i)
        control[rows,1] = fill(k[i], gp)
        control[rows,2] = p
    end
    endog=copy(control)

    # E[m,h,j]: value for control row m, endog row h and shock index j.
    # Everything that does not depend on j (or on m) is computed once in the outer loops
    # instead of being recomputed gz (or gm*gz) times; the terms are still combined in the
    # original left-to-right order so every stored value is unchanged.
    E=Array{Float32}(gm,gm,gz)
    for h=1:gm
        k_now = endog[h,1]
        p_now = endog[h,2]

        # set the nonzero net debt indicator
        p_ind = p_now < 0 ? 1 : 0

        k_pow        = k_now^theta
        debt_service = p_now*(1+r*(1-tau))
        tax_shield   = delta*k_now*tau
        debt_term    = s*p_now*p_ind

        for m=1:gm
            invest = control[m,1]-(1-delta)*k_now

            # set the investment indicator
            i_ind = invest != 0 ? 1 : 0

            adjust = i_ind*gamma*k_now + k_now*(a_capital/2)*((invest/k_now)^2)

            for j=1:gz
                # Round to Float32 first: the original stored the value in E and read it
                # back before the sign test, so the branch below must see the rounded value.
                elem = Float32((1-tau)*z[j]*k_pow + control[m,2] - debt_service +
                               tax_shield - invest - adjust + debt_term)
                # Negative values get the extra lambda1 / lambda2 terms applied.
                E[m,h,j] = elem < 0 ? elem+lambda1*elem-.5*lambda2*elem^2 : elem
            end
        end
    end
对串行版本计时，我得到：

@time dynam_serial(E,gm,gz,beta,Zprob)

> 106.880008 seconds (91.70 M allocations: 203.233 GB, 15.22% gc time)
现在，我尝试使用 SharedArray 从并行处理中获益。请注意，我重新组织了迭代，因此只有一个 for 循环，而不是两个。我还使用
v=deepcopy(Tv)
；否则，
v
将被复制为
Array
对象，而不是
SharedArray
：

"""
    dynam_parallel(E, gm, gz, beta, Zprob)

Iterate `Tv[h,j] = maximum over m of (E[m,h,j] + beta*exp_v[m,j])`, with
`exp_v = v*Zprob'`, until the largest absolute change between two successive sweeps
is no greater than `1e-4`. Each sweep is distributed over the available workers with
`@parallel`. Returns the converged `gm x gz` `SharedArray{Float32}`.

# Arguments
- `E`: array indexed as `E[m,h,j]`, with first two dimensions of length `gm` and third `gz`.
- `gm`, `gz`: number of rows and columns of the iterated matrix.
- `beta`: scalar weight applied to `exp_v`.
- `Zprob`: `gz x gz` matrix; its transpose is right-multiplied onto `v`.

# Notes
- `E` and `exp_v` are copied into `SharedArray`s once. A plain `Array` captured by the
  `@parallel` body is sent to the workers together with the closure on every sweep,
  which for a large `E` dominates the run time.
- The loop body writes `Tv[h,j]` synchronously. An `@async` inside the body starts a
  task on the worker that the enclosing `@sync` does not wait for, so `Tv` could be
  read before it was completely written.
"""
function dynam_parallel(E,gm,gz,beta,Zprob)
    # Shared, read-only copy of E: workers map it instead of receiving it by value.
    E_sh = SharedArray(eltype(E), size(E))
    copy!(E_sh, E)

    v  = SharedArray(Float32, (gm,gz))
    Tv = SharedArray(Float32, (gm,gz))
    fill!(v, E[cld(gm,2),cld(gm,2),cld(gz,2)])

    # Shared buffer for v*Zprob', allocated once and refreshed at the top of every sweep.
    exp_v = SharedArray(promote_type(Float32, eltype(Zprob)), (gm,gz))

    # Set parameters for the loop
    convcrit = 0.0001   # chosen convergence criterion
    diff = Inf32        # guarantees at least one sweep; same type as maxabs returns below

    while diff > convcrit
        copy!(exp_v, v*Zprob')

        @sync @parallel for hj = 1:(gm*gz)
            # Linear index hj -> (h, j), with h running fastest.
            h = mod1(hj, gm)
            j = cld(hj, gm)
            Tv[h,j] = findmax(E_sh[:,h,j] + beta*exp_v[:,j])[1]
        end

        diff = maxabs(Tv - v)
        # Copy instead of rebinding so the arrays captured by the loop body are never
        # reassigned and both stay SharedArrays.
        copy!(v, Tv)
    end
    return v
end
并行版本的定时;使用4核2.5 GHz I7处理器和16GB内存,我得到:

addprocs(3)
@time dynam_parallel(E,gm,gz,beta,Zprob)

> 164.237208 seconds (2.64 M allocations: 201.812 MB, 0.04% gc time)
我在这里做错什么了吗?或者,对于这个特殊的问题,有没有更好的方法在Julia中实现并行处理?我曾考虑过使用分布式阵列,但我很难看到如何将它们应用于当前的问题

更新: 根据@DanGetz和他有用的评论,我转而尝试加快串行处理版本。我能够通过以下方式将性能降低到
53.469780秒(67.36 M分配:103.419 GiB,19.12%的gc时间)

1) 升级到0.6.0(节省约25秒),其中包括有用的
@views

2) 根据Julia Performance Tips:中有关预分配输出的部分,预分配我尝试填充的主阵列(
Tv
)。(又节省了大约25秒)

剩下的最大减速似乎来自
add_vecs
函数,它将两个较大矩阵的子数组相加。我曾尝试过开发和使用BLAS函数,但未能产生更好的性能

在任何情况下,
dynam_serial
的改进代码如下:

"""
    add_vecs(r, h, j, E, exp_v, beta)

Overwrite the preallocated buffer `r` with `E[:,h,j] + beta*exp_v[:,j]` and return `r`.

`r` must have as many elements as the first dimension of `E` and `exp_v`; otherwise the
broadcast assignment throws a `DimensionMismatch`.
"""
function add_vecs(r::Array{Float32},h::Int,j::Int,E::Array{Float32},exp_v::Array{Float32},beta::Float32)
    # `.=` writes into the caller's buffer. A plain `r = ...` only rebinds the local name,
    # which leaves the buffer untouched and allocates a fresh vector on every call.
    # @views keeps the two column slices from being copied before they are combined.
    @views r .= E[:,h,j] .+ beta .* exp_v[:,j]
    return r
end


"""
    dynam_serial(E, gm, gz, beta, Zprob)

Iterate `Tv[h,j] = maximum over m of (E[m,h,j] + beta*exp_v[m,j])`, with
`exp_v = v*Zprob'`, until the largest absolute change between two successive sweeps
is no greater than `1e-4`. Returns the converged `gm x gz` matrix.

All work arrays are allocated once before the loop; `v` and `Tv` trade places after each
sweep instead of being copied, and the convergence measure is accumulated while the
entries are written rather than from a temporary `Tv - v`.
"""
function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
    v           = Array{Float32}(gm,gz)
    fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
    Tv          = Array{Float32}(gm,gz)
    exp_v       = Array{Float32}(gm,gz)
    r           = Array{Float32}(gm)

    # Set parameters for the loop
    convcrit = 0.0001   # chosen convergence criterion
    diff = Inf32        # guarantees a first sweep; Float32 so diff never changes type

    while diff>convcrit
        # exp_v = v*Zprob', written into the preallocated buffer.
        A_mul_Bt!(exp_v,v,Zprob)

        diff = 0.0f0
        # j outermost so consecutive E[:,h,j] slices are adjacent in memory; every
        # (h,j) entry is independent, so the visiting order does not affect the result.
        for j=1:gz
            for h=1:gm
                # Use the returned vector so this works whether or not add_vecs fills r.
                best = findmax(add_vecs(r,h,j,E,exp_v,beta))[1]
                Tv[h,j] = best
                diff = max(diff, abs(best - v[h,j]))
            end
        end

        # After the swap v holds the newest values and Tv is scratch for the next sweep.
        v, Tv = Tv, v
    end
    return v
end

如果
add_vecs
似乎是关键函数，那么编写一个显式的
for
循环可以提供更多的优化。下面这个版本的基准测试结果如何：

"""
    add_vecs!(r, h, j, E, exp_v, beta)

Store `E[i,h,j] + beta*exp_v[i,j]` in `r[i]` for every `i` in `1:size(E,1)` and return `r`.

Throws a `BoundsError` if `r` or `exp_v` has fewer than `size(E,1)` rows, or if `h` or `j`
is outside the corresponding dimension.
"""
function add_vecs!(r::Array{Float32},h::Int,j::Int,E::Array{Float32},
                  exp_v::Array{Float32},beta::Float32)
    n = size(E,1)
    # Check every index range once up front: the loop below runs with bounds checks
    # disabled, so a short `r` or a bad `h`/`j` would otherwise touch memory out of range.
    checkbounds(r, 1:n)
    checkbounds(E, 1:n, h, j)
    checkbounds(exp_v, 1:n, j)
    @inbounds for i=1:n
        r[i]=E[i,h,j] + beta*exp_v[i,j]
    end
    return r
end
更新

要继续优化dynam_serial,我已尝试删除更多分配。结果是:

"""
    add_vecs_and_max!(gm, r, h, j, E, exp_v, beta)

Store `E[i,h,j] + beta*exp_v[i,j]` in `r[i]` for `i` in `1:gm` and return the largest of
those `gm` values.

Throws a `BoundsError` if `1:gm`, `h` or `j` does not fit `r`, `E` or `exp_v`.
"""
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
                           exp_v::Array{Float64},beta::Float64)
    # Check every index range once up front: the loop below runs with bounds checks
    # disabled, so a `gm` larger than the arrays would otherwise touch memory out of range.
    checkbounds(r, 1:gm)
    checkbounds(E, 1:gm, h, j)
    checkbounds(exp_v, 1:gm, j)
    @inbounds for i=1:gm
        r[i] = E[i,h,j]+beta*exp_v[i,j]
    end
    # Only r[1:gm] was just written. When r is longer, the remaining entries are stale
    # and must not take part in the maximum; the common equal-length case avoids the view.
    return gm == length(r) ? findmax(r)[1] : findmax(view(r, 1:gm))[1]
end

"""
    dynam_serial(E, gm, gz, beta, Zprob)

Repeatedly replace every `v[h,j]` by the value `add_vecs_and_max!` returns for `(h,j)`,
using `exp_v = v*Zprob'` computed at the start of the sweep, until the largest absolute
change within one sweep is no greater than `1e-4`. Returns the `gm x gz` matrix `v`.
"""
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
                      beta::Float64,Zprob::Array{Float64})
    tol = 0.0001   # convergence threshold

    # Every entry starts at the element in the middle of E.
    value    = fill(E[cld(gm,2),cld(gm,2),cld(gz,2)], gm, gz)
    scratch  = Array{Float64}(gm)      # work vector handed to add_vecs_and_max!
    expected = Array{Float64}(gm,gz)   # receives value*Zprob' each sweep

    gap = 1.0   # any starting value above tol
    while gap > tol
        # expected is fixed for the whole sweep, so overwriting value in place is safe.
        A_mul_Bt!(expected, value, Zprob)
        gap = -Inf
        for state = 1:gm, shock = 1:gz
            previous = value[state,shock]
            updated  = add_vecs_and_max!(gm, scratch, state, shock, E, expected, beta)
            value[state,shock] = updated
            # Largest absolute change seen so far in this sweep.
            gap = max(gap, previous-updated, updated-previous)
        end
    end
    return value
end
将函数切换为使用 Float64 应该可以提高速度（因为 CPU 本身就针对 64 位字长进行了优化）。另外，直接使用会修改参数的
A_mul_Bt! 可以省去另一次分配。通过交换数组
v
和
Tv
可以避免
copy(...)

这些优化如何提高您的运行时间

第二次更新


更新更新部分中的代码以使用
findmax
。此外,将
dynam_serial
更改为使用
v
而不使用
Tv
,因为除了
diff
计算之外,无需保存旧版本,该计算现在在循环内部完成。

如果
add_vecs
似乎是关键函数，那么编写一个显式的
for
循环可以提供更多的优化。下面这个版本的基准测试结果如何：

"""
    add_vecs!(r, h, j, E, exp_v, beta)

Store `E[i,h,j] + beta*exp_v[i,j]` in `r[i]` for every `i` in `1:size(E,1)` and return `r`.

Throws a `BoundsError` if `r` or `exp_v` has fewer than `size(E,1)` rows, or if `h` or `j`
is outside the corresponding dimension.
"""
function add_vecs!(r::Array{Float32},h::Int,j::Int,E::Array{Float32},
                  exp_v::Array{Float32},beta::Float32)
    n = size(E,1)
    # Check every index range once up front: the loop below runs with bounds checks
    # disabled, so a short `r` or a bad `h`/`j` would otherwise touch memory out of range.
    checkbounds(r, 1:n)
    checkbounds(E, 1:n, h, j)
    checkbounds(exp_v, 1:n, j)
    @inbounds for i=1:n
        r[i]=E[i,h,j] + beta*exp_v[i,j]
    end
    return r
end
更新

要继续优化dynam_serial,我已尝试删除更多分配。结果是:

"""
    add_vecs_and_max!(gm, r, h, j, E, exp_v, beta)

Store `E[i,h,j] + beta*exp_v[i,j]` in `r[i]` for `i` in `1:gm` and return the largest of
those `gm` values.

Throws a `BoundsError` if `1:gm`, `h` or `j` does not fit `r`, `E` or `exp_v`.
"""
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
                           exp_v::Array{Float64},beta::Float64)
    # Check every index range once up front: the loop below runs with bounds checks
    # disabled, so a `gm` larger than the arrays would otherwise touch memory out of range.
    checkbounds(r, 1:gm)
    checkbounds(E, 1:gm, h, j)
    checkbounds(exp_v, 1:gm, j)
    @inbounds for i=1:gm
        r[i] = E[i,h,j]+beta*exp_v[i,j]
    end
    # Only r[1:gm] was just written. When r is longer, the remaining entries are stale
    # and must not take part in the maximum; the common equal-length case avoids the view.
    return gm == length(r) ? findmax(r)[1] : findmax(view(r, 1:gm))[1]
end

"""
    dynam_serial(E, gm, gz, beta, Zprob)

Repeatedly replace every `v[h,j]` by the value `add_vecs_and_max!` returns for `(h,j)`,
using `exp_v = v*Zprob'` computed at the start of the sweep, until the largest absolute
change within one sweep is no greater than `1e-4`. Returns the `gm x gz` matrix `v`.
"""
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
                      beta::Float64,Zprob::Array{Float64})
    tol = 0.0001   # convergence threshold

    # Every entry starts at the element in the middle of E.
    value    = fill(E[cld(gm,2),cld(gm,2),cld(gz,2)], gm, gz)
    scratch  = Array{Float64}(gm)      # work vector handed to add_vecs_and_max!
    expected = Array{Float64}(gm,gz)   # receives value*Zprob' each sweep

    gap = 1.0   # any starting value above tol
    while gap > tol
        # expected is fixed for the whole sweep, so overwriting value in place is safe.
        A_mul_Bt!(expected, value, Zprob)
        gap = -Inf
        for state = 1:gm, shock = 1:gz
            previous = value[state,shock]
            updated  = add_vecs_and_max!(gm, scratch, state, shock, E, expected, beta)
            value[state,shock] = updated
            # Largest absolute change seen so far in this sweep.
            gap = max(gap, previous-updated, updated-previous)
        end
    end
    return value
end
将函数切换为使用 Float64 应该可以提高速度（因为 CPU 本身就针对 64 位字长进行了优化）。另外，直接使用会修改参数的
A_mul_Bt! 可以省去另一次分配。通过交换数组
v
和
Tv
可以避免
copy(...)

这些优化如何提高您的运行时间

第二次更新


更新更新部分中的代码以使用
findmax
。另外,将
dynam_serial
更改为使用
v
而不使用
Tv
,因为除了
diff
计算之外,无需保存旧版本,该计算现在在循环内部完成。

这是我复制和粘贴的、由 Dan Getz 提供的代码。我把数组和标量的定义完全按照我运行时的样子一并包含在内。性能是：
39.507005 seconds (11 allocations: 486.891 KiB)
运行的是
@time dynam_serial(E,gm,gz,beta,Zprob)


这是我复制和粘贴的、由上面的 Dan Getz 提供的代码。我把数组和标量的定义完全按照我运行时的样子一并包含在内。性能是：
39.507005 seconds (11 allocations: 486.891 KiB)
运行的是
@time dynam_serial(E,gm,gz,beta,Zprob)


也许先优化非并行版本?矩阵有约20000个元素,即160KB,但计时显示它分配了数百万次,大约是矩阵大小的1000倍。应该有很大的改进空间。此外,使用最新的Julia版本和矢量化的点表示法也是一大好处。
dynam_serial
返回什么？目前，您可以通过简单地不运行它来优化它，因为它没有任何外部影响，也不返回任何内容。 Hi Dan，谢谢您的评论。我截断了函数只是为了说明问题。最终，它返回矩阵
v
(如图所示)以及决策规则(未显示),这些规则稍后将在模拟中使用(也未显示)。我将尽我所能,首先尝试优化非并行版本,并升级到最新版本。还要注意的是,虽然矩阵是~20000个元素,但这20000个元素会在多次迭代中填充,直到收敛。Dan,你的评论非常有用。我继续致力于预分配每个迭代中使用的数组,并将性能提高到
60.107114秒
using SpecialFunctions
# Estimated parameters; each entry is read by position just below.
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]

r          = 0.015
tau        = 0.35
rho        =est_params[1]
sigma      =est_params[2]
delta      = 0.15
gamma      =est_params[3]
a_capital  =est_params[4]
lambda1    =est_params[5]
lambda2    =est_params[6]
s          =est_params[7]
theta      =est_params[8]
mu         =est_params[9]
p_bar_k_ss  =est_params[10]
# Reciprocal of the gross rate 1 + r, stored as Float32 to match dynam_serial's signature.
beta  = Float32((1+r)^(-1))
sigma_range = 4
# Number of grid points for z, p and k respectively.
gz = 19
gp = 29
gk = 37

# lnz: gz evenly spaced points on [-sigma_range*sigma, sigma_range*sigma].
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
# Element-wise exponential via dot-call; calling exp on a whole vector is the
# deprecated vectorized form on Julia 0.6.
z=exp.(lnz)

# k grid: gk_m points below k_ss, k_ss itself, and gk_m points above it,
# spaced by powers of (1 - delta), then sorted ascending.
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss =  (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=vcat([k_ss*((1-delta)^i) for i in 1:gk_m],
       k_ss,
       [k_ss/((1-delta)^i) for i in 1:gk_m])
sort!(k)

# p grid: gp evenly spaced points on [-p_bar/2, p_bar].
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))

#Tauchen
# Builds Zprob (N x N, Float32) from the nodes Z and the parameters mu, rho, sigma,
# then Zcumprob as its running sum along each row.
N     = length(z)
Zprob = zeros(Float32,N,N)

# Z: N evenly spaced nodes running from lnz[1] to lnz[N].
Z     = zeros(N,1)
Z[1]  = lnz[1]
Z[N]  = lnz[length(z)]
zstep = (Z[N] - Z[1]) / (N - 1)
for i=2:(N-1)
    Z[i] = Z[1] + zstep * (i - 1)
end

for a = 1:N, b = 1:N
    # 0.5*erfc(-x/sqrt(2)) evaluated at the upper and lower edge of the cell
    # of width zstep centred on Z[b], shifted by mu + rho*Z[a] and scaled by sigma.
    upper = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
    lower = 0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
    # The first column keeps everything below its upper edge, the last column
    # everything above its lower edge, interior columns the mass in between.
    Zprob[a,b] = b == 1 ? upper : (b == N ? 1 - lower : upper - lower)
end

# An earlier variant obtained Zprob from a `tauchen(gz, rho, sigma, mu, sigma_range)`
# helper (second element of its result); the loop above replaces that call.
# The 2 selects the 2nd dimension, i.e. the sum runs across columns.
Zcumprob = cumsum(Zprob, 2)


gm = gk * gp

# control lists every (k, p) pair: column 1 holds k[i] repeated gp times per block,
# column 2 holds the whole p grid inside each block. endog is an independent copy.
control=zeros(gm,2)
for i=1:gk
    rows = (1+gp*(i-1)):(gp*i)
    control[rows,1] = fill(k[i], gp)
    control[rows,2] = p
end
endog=copy(control)

# E[m,h,j]: value for control row m, endog row h and shock index j.
# Array{T}(dims...) replaces the deprecated Array(T, dims...) constructor form.
# Everything that does not depend on j (or on m) is computed once in the outer loops
# instead of being recomputed gz (or gm*gz) times; the terms are still combined in the
# original left-to-right order so every stored value is unchanged.
E=Array{Float32}(gm,gm,gz)
for h=1:gm
    k_now = endog[h,1]
    p_now = endog[h,2]

    # set the nonzero net debt indicator
    p_ind = p_now < 0 ? 1 : 0

    k_pow        = k_now^theta
    debt_service = p_now*(1+r*(1-tau))
    tax_shield   = delta*k_now*tau
    debt_term    = s*p_now*p_ind

    for m=1:gm
        invest = control[m,1]-(1-delta)*k_now

        # set the investment indicator
        i_ind = invest != 0 ? 1 : 0

        adjust = i_ind*gamma*k_now + k_now*(a_capital/2)*((invest/k_now)^2)

        for j=1:gz
            # Round to Float32 first: the original stored the value in E and read it
            # back before the sign test, so the branch below must see the rounded value.
            elem = Float32((1-tau)*z[j]*k_pow + control[m,2] - debt_service +
                           tax_shield - invest - adjust + debt_term)
            # Negative values get the extra lambda1 / lambda2 terms applied.
            E[m,h,j] = elem < 0 ? elem+lambda1*elem-.5*lambda2*elem^2 : elem
        end
    end
end

 """
     add_vecs!(gm, r, h, j, E, exp_v, beta)

 Write `E[i,h,j] + beta*exp_v[i,j]` into `r[i]` for `i` in `1:gm` and return `r`.
 Indexing is unchecked, so the caller must make sure `gm`, `h` and `j` fit the arrays.
 """
 function add_vecs!(gm::Int,r::Array{Float32},h::Int,j::Int,E::Array{Float32},
                    exp_v::Array{Float32},beta::Float32)
     # Only scalar indexing happens here, so no slice views are involved.
     @inbounds for row in 1:gm
         r[row] = E[row,h,j] + beta*exp_v[row,j]
     end
     return r
 end

 """
     dynam_serial(E, gm, gz, beta, Zprob)

 Iterate `Tv[h,j] = maximum over m of (E[m,h,j] + beta*exp_v[m,j])`, with
 `exp_v = v*Zprob'`, until the largest absolute change between two successive sweeps
 is no greater than `1e-4`. Returns the converged `gm x gz` matrix.

 All buffers are allocated once; `v` and `Tv` trade places after every sweep, and the
 convergence measure is accumulated while the entries are written, so a sweep allocates
 nothing.
 """
 function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
     v           = Array{Float32}(gm,gz)
     fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
     Tv          = Array{Float32}(gm,gz)
     exp_v       = Array{Float32}(gm,gz)
     r           = Array{Float32}(gm)

     # Set parameters for the loop
     convcrit = 0.0001   # chosen convergence criterion
     diff = Inf32        # guarantees a first sweep; Float32 so diff never changes type

     while diff>convcrit
         # exp_v = v*Zprob', written into the preallocated buffer.
         A_mul_Bt!(exp_v,v,Zprob)

         diff = 0.0f0
         # j outermost so consecutive E[:,h,j] slices are adjacent in memory; every
         # (h,j) entry is independent, so the visiting order does not affect the result.
         for j=1:gz
             for h=1:gm
                 best = findmax(add_vecs!(gm,r,h,j,E,exp_v,beta))[1]
                 Tv[h,j] = best
                 # Track the largest change here instead of allocating Tv - v afterwards.
                 diff = max(diff, abs(best - v[h,j]))
             end
         end

         # After the swap v holds the newest values and Tv is scratch for the next sweep.
         (v,Tv)=(Tv,v)
     end
     return v
 end