Parallel processing 在Julia中使用并行处理填充矩阵
我试图通过并行处理来加快Julia(v.0.5.0)中动态规划问题的求解速度。该问题涉及在每次迭代中为1073 x 19矩阵的每个元素选择最佳值,直到连续矩阵差落在公差范围内。我认为,在每次迭代中,为矩阵的每个元素填充值都可以并行化。然而,我发现使用Parallel processing 在Julia中使用并行处理填充矩阵,parallel-processing,julia,dynamic-programming,Parallel Processing,Julia,Dynamic Programming,我试图通过并行处理来加快Julia(v.0.5.0)中动态规划问题的求解速度。该问题涉及在每次迭代中为1073 x 19矩阵的每个元素选择最佳值,直到连续矩阵差落在公差范围内。我认为,在每次迭代中,为矩阵的每个元素填充值都可以并行化。然而,我发现使用SharedArray会导致性能大幅下降,我想知道是否有更好的方法来解决这个问题 我为下面的函数构造参数: est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.719
SharedArray
会导致性能大幅下降,我想知道是否有更好的方法来解决这个问题
我为下面的函数构造参数:
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
r = 0.015
tau = 0.35
rho =est_params[1]
sigma =est_params[2]
delta = 0.15
gamma =est_params[3]
a_capital =est_params[4]
lambda1 =est_params[5]
lambda2 =est_params[6]
s =est_params[7]
theta =est_params[8]
mu =est_params[9]
p_bar_k_ss =est_params[10]
beta = (1+r)^(-1)
sigma_range = 4
gz = 19
gp = 29
gk = 37
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
z=exp(lnz)
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=cat(1,map(i->k_ss*((1-delta)^i),collect(1:gk_m)),map(i->k_ss/((1-delta)^i),collect(1:gk_m)))
insert!(k,gk_m+1,k_ss)
sort!(k)
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))
#Tauchen
N = length(z)
Z = zeros(N,1)
Zprob = zeros(Float32,N,N)
Z[N] = lnz[length(z)]
Z[1] = lnz[1]
zstep = (Z[N] - Z[1]) / (N - 1)
for i=2:(N-1)
Z[i] = Z[1] + zstep * (i - 1)
end
for a = 1 : N
for b = 1 : N
if b == 1
Zprob[a,b] = 0.5*erfc(-((Z[1] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
elseif b == N
Zprob[a,b] = 1 - 0.5*erfc(-((Z[N] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
else
Zprob[a,b] = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2)) -
0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
end
end
end
# Collecting tauchen results in a 2 element array of linspace and array; [2] gets array
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]
Zcumprob=zeros(Float32,gz,gz)
# 2 in cumsum! denotes the 2nd dimension, i.e. columns
cumsum!(Zcumprob, Zprob,2)
gm = gk * gp
control=zeros(gm,2)
for i=1:gk
control[(1+gp*(i-1)):(gp*i),1]=fill(k[i],(gp,1))
control[(1+gp*(i-1)):(gp*i),2]=p
end
endog=copy(control)
E=Array(Float32,gm,gm,gz)
for h=1:gm
for m=1:gm
for j=1:gz
# set the nonzero net debt indicator
if endog[h,2]<0
p_ind=1
else
p_ind=0
end
# set the investment indicator
if (control[m,1]-(1-delta)*endog[h,1])!=0
i_ind=1
else
i_ind=0
end
E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2]-endog[h,2]*(1+r*(1-tau)) +
delta*endog[h,1]*tau-(control[m,1]-(1-delta)*endog[h,1]) -
(i_ind*gamma*endog[h,1]+endog[h,1]*(a_capital/2)*(((control[m,1]-(1-delta)*endog[h,1])/endog[h,1])^2)) +
s*endog[h,2]*p_ind
elem = E[m,h,j]
if E[m,h,j]<0
E[m,h,j]=elem+lambda1*elem-.5*lambda2*elem^2
else
E[m,h,j]=elem
end
end
end
end
在这个时候,我得到:
@time dynam_serial(E,gm,gz,beta,Zprob)
> 106.880008 seconds (91.70 M allocations: 203.233 GB, 15.22% gc time)
现在,我尝试使用共享阵列从并行处理中获益。请注意,我重新配置了迭代,因此对于循环,我只有一个,而不是两个。我还使用v=deepcopy(电视)
;否则,v
将作为Array
对象而不是SharedArray
复制:
function dynam_parallel(E,gm,gz,beta,Zprob)
v = SharedArray(Float32,(gm,gz),init = S -> S[Base.localindexes(S)] = myid() )
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1 # arbitrary initial value greater than convcrit
while diff>convcrit
exp_v=v*Zprob'
Tv = SharedArray(Float32,gm,gz,init = S -> S[Base.localindexes(S)] = myid() )
@sync @parallel for hj=1:(gm*gz)
j=cld(hj,gm)
h=mod(hj,gm)
if h==0;h=gm;end;
@async Tv[h,j]=findmax(E[:,h,j] + beta*exp_v[:,j])[1]
end
diff = maxabs(Tv - v)
v=deepcopy(Tv)
end
end
并行版本的定时;使用4核2.5 GHz I7处理器和16GB内存,我得到:
addprocs(3)
@time dynam_parallel(E,gm,gz,beta,Zprob)
> 164.237208 seconds (2.64 M allocations: 201.812 MB, 0.04% gc time)
我在这里做错什么了吗?或者,对于这个特殊的问题,有没有更好的方法在Julia中实现并行处理?我曾考虑过使用分布式阵列,但我很难看到如何将它们应用于当前的问题
更新:
根据@DanGetz和他有用的评论,我转而尝试加快串行处理版本。我能够通过以下方式将性能降低到53.469780秒(67.36 M分配:103.419 GiB,19.12%的gc时间)
:
1) 升级到0.6.0(节省约25秒),其中包括有用的@views
宏
2) 根据Julia Performance Tips:中有关预分配输出的部分,预分配我尝试填充的主阵列(Tv
)。(又节省了大约25秒)
剩下的最大减速似乎来自add_vecs
函数,它将两个较大矩阵的子数组相加。我曾尝试过开发和使用BLAS函数,但未能产生更好的性能
在任何情况下,dynam_serial
的改进代码如下:
function add_vecs(r::Array{Float32},h::Int,j::Int,E::Array{Float32},exp_v::Array{Float32},beta::Float32)
@views r=E[:,h,j] + beta*exp_v[:,j]
return r
end
function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
v = Array{Float32}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
Tv = Array{Float32}(gm,gz)
r = Array{Float32}(gm)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1 # arbitrary initial value greater than convcrit
while diff>convcrit
exp_v=v*Zprob'
for h=1:gm
for j=1:gz
@views Tv[h,j]=findmax(add_vecs(r,h,j,E,exp_v,beta))[1]
end
end
diff = maximum(abs,Tv - v)
v=copy(Tv)
end
return Tv
end
如果add_vecs
似乎是关键功能,那么为循环编写一个明确的,可以提供更多的优化。以下基准是如何确定的:
function add_vecs!(r::Array{Float32},h::Int,j::Int,E::Array{Float32},
exp_v::Array{Float32},beta::Float32)
@inbounds for i=1:size(E,1)
r[i]=E[i,h,j] + beta*exp_v[i,j]
end
return r
end
更新
要继续优化dynam_serial,我已尝试删除更多分配。结果是:
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
exp_v::Array{Float64},beta::Float64)
@inbounds for i=1:gm
r[i] = E[i,h,j]+beta*exp_v[i,j]
end
return findmax(r)[1]
end
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
beta::Float64,Zprob::Array{Float64})
v = Array{Float64}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
r = Array{Float64}(gm)
exp_v = Array{Float64}(gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1.0 # arbitrary initial value greater than convcrit
while diff>convcrit
A_mul_Bt!(exp_v,v,Zprob)
diff = -Inf
for h=1:gm
for j=1:gz
oldv = v[h,j]
newv = add_vecs_and_max!(gm,r,h,j,E,exp_v,beta)
v[h,j]= newv
diff = max(diff, oldv-newv, newv-oldv)
end
end
end
return v
end
将函数切换为使用Float64应该可以提高速度(因为CPU本身就针对64位字长度进行了优化)。另外,使用变异A_mul_Bt代码>直接保存另一个分配。通过切换阵列v
和Tv
避免复制(…)
这些优化如何提高您的运行时间
第二次更新
更新更新部分中的代码以使用findmax
。此外,将dynam_serial
更改为使用v
而不使用Tv
,因为除了diff
计算之外,无需保存旧版本,该计算现在在循环内部完成。如果添加向量
似乎是关键功能,为
循环编写一个显式的,可以提供更多的优化。以下基准是如何确定的:
function add_vecs!(r::Array{Float32},h::Int,j::Int,E::Array{Float32},
exp_v::Array{Float32},beta::Float32)
@inbounds for i=1:size(E,1)
r[i]=E[i,h,j] + beta*exp_v[i,j]
end
return r
end
更新
要继续优化dynam_serial,我已尝试删除更多分配。结果是:
function add_vecs_and_max!(gm::Int,r::Array{Float64},h::Int,j::Int,E::Array{Float64},
exp_v::Array{Float64},beta::Float64)
@inbounds for i=1:gm
r[i] = E[i,h,j]+beta*exp_v[i,j]
end
return findmax(r)[1]
end
function dynam_serial(E::Array{Float64},gm::Int,gz::Int,
beta::Float64,Zprob::Array{Float64})
v = Array{Float64}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
r = Array{Float64}(gm)
exp_v = Array{Float64}(gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1.0 # arbitrary initial value greater than convcrit
while diff>convcrit
A_mul_Bt!(exp_v,v,Zprob)
diff = -Inf
for h=1:gm
for j=1:gz
oldv = v[h,j]
newv = add_vecs_and_max!(gm,r,h,j,E,exp_v,beta)
v[h,j]= newv
diff = max(diff, oldv-newv, newv-oldv)
end
end
end
return v
end
将函数切换为使用Float64应该可以提高速度(因为CPU本身就针对64位字长度进行了优化)。另外,使用变异A_mul_Bt代码>直接保存另一个分配。通过切换阵列v
和Tv
避免复制(…)
这些优化如何提高您的运行时间
第二次更新
更新更新部分中的代码以使用findmax
。另外,将dynam_serial
更改为使用v
而不使用Tv
,因为除了diff
计算之外,无需保存旧版本,该计算现在在循环内部完成。这是我复制和粘贴的代码,由Dan Getz提供。我完全按照运行数组和标量定义的方式包含它们。性能是:39.507005秒(11次分配:486.891kib)
运行@time-dynam\u-serial(E、gm、gz、beta、Zprob)
这是我复制和粘贴的代码,由上面的Dan Getz提供。我完全按照运行数组和标量定义的方式包含它们。性能是:39.507005秒(11次分配:486.891kib)
运行@time-dynam\u-serial(E、gm、gz、beta、Zprob)
也许先优化非并行版本?矩阵有约20000个元素,即160KB,但计时显示它分配了数百万次,大约是矩阵大小的1000倍。应该有很大的改进空间。此外,使用最新的Julia版本和矢量化的点表示法也是一大好处。dynam_serial
返回什么?目前,您可以通过简单地不运行它来优化它,因为它没有任何外部影响,并且不会返回任何HI Dan,谢谢您的评论。我截断了函数只是为了说明问题。最终,它返回矩阵v
(如图所示)以及决策规则(未显示),这些规则稍后将在模拟中使用(也未显示)。我将尽我所能,首先尝试优化非并行版本,并升级到最新版本。还要注意的是,虽然矩阵是~20000个元素,但这20000个元素会在多次迭代中填充,直到收敛。Dan,你的评论非常有用。我继续致力于预分配每个迭代中使用的数组,并将性能提高到60.107114秒
using SpecialFunctions
est_params = [.788,.288,.0034,.1519,.1615,.0041,.0077,.2,0.005,.7196]
r = 0.015
tau = 0.35
rho =est_params[1]
sigma =est_params[2]
delta = 0.15
gamma =est_params[3]
a_capital =est_params[4]
lambda1 =est_params[5]
lambda2 =est_params[6]
s =est_params[7]
theta =est_params[8]
mu =est_params[9]
p_bar_k_ss =est_params[10]
beta = Float32((1+r)^(-1))
sigma_range = 4
gz = 19
gp = 29
gk = 37
lnz=collect(linspace(-sigma_range*sigma,sigma_range*sigma,gz))
z=exp(lnz)
gk_m = fld(gk,2)
# Need to add mu somewhere to k_ss
k_ss = (theta*(1-tau)/(r+delta))^(1/(1-theta))
k=cat(1,map(i->k_ss*((1-delta)^i),collect(1:gk_m)),map(i->k_ss/((1-delta)^i),collect(1:gk_m)))
insert!(k,gk_m+1,k_ss)
sort!(k)
p_bar=p_bar_k_ss*k_ss
p = collect(linspace(-p_bar/2,p_bar,gp))
#Tauchen
N = length(z)
Z = zeros(N,1)
Zprob = zeros(Float32,N,N)
Z[N] = lnz[length(z)]
Z[1] = lnz[1]
zstep = (Z[N] - Z[1]) / (N - 1)
for i=2:(N-1)
Z[i] = Z[1] + zstep * (i - 1)
end
for a = 1 : N
for b = 1 : N
if b == 1
Zprob[a,b] = 0.5*erfc(-((Z[1] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2))
elseif b == N
Zprob[a,b] = 1 - 0.5*erfc(-((Z[N] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
else
Zprob[a,b] = 0.5*erfc(-((Z[b] - mu - rho * Z[a] + zstep / 2) / sigma)/sqrt(2)) -
0.5*erfc(-((Z[b] - mu - rho * Z[a] - zstep / 2) / sigma)/sqrt(2))
end
end
end
# Collecting tauchen results in a 2 element array of linspace and array; [2] gets array
# Zprob=collect(tauchen(gz, rho, sigma, mu, sigma_range))[2]
Zcumprob=zeros(Float32,gz,gz)
# 2 in cumsum! denotes the 2nd dimension, i.e. columns
cumsum!(Zcumprob, Zprob,2)
gm = gk * gp
control=zeros(gm,2)
for i=1:gk
control[(1+gp*(i-1)):(gp*i),1]=fill(k[i],(gp,1))
control[(1+gp*(i-1)):(gp*i),2]=p
end
endog=copy(control)
E=Array(Float32,gm,gm,gz)
for h=1:gm
for m=1:gm
for j=1:gz
# set the nonzero net debt indicator
if endog[h,2]<0
p_ind=1
else
p_ind=0
end
# set the investment indicator
if (control[m,1]-(1-delta)*endog[h,1])!=0
i_ind=1
else
i_ind=0
end
E[m,h,j] = (1-tau)*z[j]*(endog[h,1]^theta) + control[m,2]-endog[h,2]*(1+r*(1-tau)) +
delta*endog[h,1]*tau-(control[m,1]-(1-delta)*endog[h,1]) -
(i_ind*gamma*endog[h,1]+endog[h,1]*(a_capital/2)*(((control[m,1]-(1-delta)*endog[h,1])/endog[h,1])^2)) +
s*endog[h,2]*p_ind
elem = E[m,h,j]
if E[m,h,j]<0
E[m,h,j]=elem+lambda1*elem-.5*lambda2*elem^2
else
E[m,h,j]=elem
end
end
end
end
function add_vecs!(gm::Int,r::Array{Float32},h::Int,j::Int,E::Array{Float32},
exp_v::Array{Float32},beta::Float32)
@inbounds @views for i=1:gm
r[i]=E[i,h,j] + beta*exp_v[i,j]
end
return r
end
function dynam_serial(E::Array{Float32},gm::Int,gz::Int,beta::Float32,Zprob::Array{Float32})
v = Array{Float32}(gm,gz)
fill!(v,E[cld(gm,2),cld(gm,2),cld(gz,2)])
Tv = Array{Float32}(gm,gz)
# Set parameters for the loop
convcrit = 0.0001 # chosen convergence criterion
diff = 1.00000 # arbitrary initial value greater than convcrit
iter=0
exp_v=Array{Float32}(gm,gz)
r=Array{Float32}(gm)
while diff>convcrit
A_mul_Bt!(exp_v,v,Zprob)
for h=1:gm
for j=1:gz
Tv[h,j]=findmax(add_vecs!(gm,r,h,j,E,exp_v,beta))[1]
end
end
diff = maximum(abs,Tv - v)
(v,Tv)=(Tv,v)
end
return v
end