加速Python中的集成函数
我有一个函数,它是某个更大问题的内层循环,因此会被调用数百万次。我试着优化它。但由于这是我的第一个数值计算项目,我想知道是否还有其他方法可以提高速度。Cython似乎帮不上什么忙:也许numpy已经接近C的速度了,或者是我没有高效地编写Cython代码。加速Python中的集成函数,python,loops,numpy,optimization,Python,Loops,Numpy,Optimization,我有一个函数,它是某个更大问题的内层循环,因此会被调用数百万次。我试着优化它。但由于这是我的第一个数值计算项目,我想知道是否还有其他方法可以提高速度。Cython似乎帮不上什么忙:也许numpy已经接近C的速度了,或者是我没有高效地编写Cython代码。 import numpy as np import math import numexpr as ne par_mu_rho=0.8 par_alpha_rho=0.7 #“前两个是mus和the的平均值” #“后两个是阿尔法的平均值。” cov_epsilon=[[1,par_mu_rho],[par_
import math

import numexpr as ne
import numpy as np
# Off-diagonal terms of the two 2x2 covariance matrices below.
par_mu_rho = 0.8
par_alpha_rho = 0.7
# In the four-element mean vector passed to mktout, the first two entries
# are the means of the mus and the last two are the means of the alphas.
cov_epsilon = [[1, par_mu_rho], [par_mu_rho, 1]]
cov_nu = [[1, par_alpha_rho], [par_alpha_rho, 1]]
nrows = 10000
np.random.seed(123)  # fixed seed so the simulated draws are reproducible
epsilon_sim = np.random.multivariate_normal([0, 0], cov_epsilon, nrows)
nu_sim = np.random.multivariate_normal([0, 0], cov_nu, nrows)
# Columns 0-1 hold the epsilon draws, columns 2-3 the nu draws.
errors = np.concatenate((epsilon_sim, nu_sim), axis=1)
errors = np.exp(errors)
### The function to be optimized
def mktout(mean_mu_alpha, errors, par_gamma):
    """Average t1, t2, p1 and p2 over the rows of ``errors``.

    Args:
        mean_mu_alpha: Sequence of four numbers. ``exp`` of entries 0 and 1
            scales error columns 0 and 1 (the mus); ``exp`` of entries 2 and
            3 scales columns 2 and 3 (the alphas).
        errors: 2-D NumPy array; columns 0-3 are read.
        par_gamma: ``exp(par_gamma)`` turns mu10/mu20 into mu11/mu21.

    Returns:
        Tuple ``(t1, t2, p1, p2)`` of means taken over the rows.
    """
    # Divide by the actual row count instead of a hard-coded 10000 so the
    # means stay correct for any number of simulated rows.
    n_rows = errors.shape[0]
    exp_gamma = math.exp(par_gamma)

    mu10 = errors[:, 0] * math.exp(mean_mu_alpha[0])
    mu11 = exp_gamma * mu10  # mu with gamma
    mu20 = errors[:, 1] * math.exp(mean_mu_alpha[1])
    mu21 = exp_gamma * mu20
    alpha1 = errors[:, 2] * math.exp(mean_mu_alpha[2])
    alpha2 = errors[:, 3] * math.exp(mean_mu_alpha[3])

    j_is_larger = mu10 > mu20
    j_is_smaller = ~j_is_larger
    threshold2 = (1 + mu10 * alpha1) / (168 + alpha1)
    threshold3 = (1 + mu20 * alpha2) / (168 + alpha2)

    # Boolean masks: case1-3 can only be set where mu10 > mu20,
    # case4-6 only where it is not.
    case1 = j_is_larger * (mu10 < 1 / 168)
    case2 = j_is_larger * (mu21 >= threshold2)
    case3 = j_is_larger ^ (case1 | case2)
    case4 = j_is_smaller * (mu20 < 1 / 168)
    case5 = j_is_smaller * (mu11 >= threshold3)
    case6 = j_is_smaller ^ (case4 | case5)

    # numexpr looks up the array names used in the expression in this
    # function's local scope, so the names in the strings must match.
    t0 = ne.evaluate(
        "case1*168"
        "+case2*(168+alpha1+alpha2)/(1+mu11*alpha1+mu21*alpha2)"
        "+case3/threshold2"
        "+case4*168"
        "+case5*(168+alpha1+alpha2)/(1+mu11*alpha1+mu21*alpha2)"
        "+case6/threshold3"
    )
    # For some cases t1 would be 0 anyway, so they are omitted here.
    t1 = ne.evaluate(
        "case2*(t0*alpha1*mu11-alpha1)"
        "+case3*(t0*alpha1*mu10-alpha1)"
        "+case5*(t0*alpha1*mu11-alpha1)"
    )
    # t2 is whatever remains of the 168 total after t0 and t1.
    t2 = 168 - t0 - t1

    p12 = case2 + case5
    p1 = case3 + p12
    p2 = case6 + p12
    return (
        t1.sum() / n_rows,
        t2.sum() / n_rows,
        p1.sum() / n_rows,
        p2.sum() / n_rows,
    )
%timeit mktout([-6, -6, -1, -1], errors, -0.7)
在一台 2.2GHz i7 的旧 Mac 上,该函数每次运行耗时约 200µs。
更新:
基于 @CodeSurgeon 和 @GZ0 的建议和代码,我决定使用以下代码:
def mktout_full(double[:] mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    """Return the row means of t1, t2, p1, p2 and p12 over ``errors``.

    Branching variant: each row only computes the quantities needed by the
    branch it falls into.

    Args:
        mean_mu_alpha: Four doubles; ``exp`` of each one scales the matching
            column of ``errors``.
        errors: C-contiguous 2-D array of doubles; columns 0-3 are read.
        par_gamma: ``exp(par_gamma)`` scales mu10 and mu20 into mu11 and mu21.

    Returns:
        Tuple ``(t1_mean, t2_mean, p1_mean, p2_mean, p12_mean)``.
    """
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        double threshold2, threshold3
        double t0, t1
        double t1_sum, t2_sum, p1_sum, p2_sum, p12_sum
        double c

    # compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha[0])
    exp[1] = cmath.exp(mean_mu_alpha[1])
    exp[2] = cmath.exp(mean_mu_alpha[2])
    exp[3] = cmath.exp(mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0
    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    p12_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu20 = errors[i, 1] * exp[1]
        if mu10 >= mu20:
            # Rows with mu10 < 1/c add nothing to any of the sums.
            if mu10 >= 1 / c:
                mu21 = exp_par_gamma * mu20
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
                if mu21 >= threshold2:
                    mu11 = exp_par_gamma * mu10
                    t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = t0 * alpha1 * mu11 - alpha1
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else:
                    # Here t0 is 1/threshold2; only t1 and p1 are updated.
                    t1_sum += (1 / threshold2) * alpha1 * mu10 - alpha1
                    p1_sum += 1
        else:
            # Rows with mu20 < 1/c add nothing to any of the sums.
            if mu20 >= 1 / c:
                mu11 = exp_par_gamma * mu10
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
                if mu11 >= threshold3:
                    mu21 = exp_par_gamma * mu20
                    t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = t0 * alpha1 * mu11 - alpha1
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else:
                    # Here t0 is 1/threshold3; only t2 and p2 are updated.
                    t2_sum += (1 / threshold3) * alpha2 * mu20 - alpha2
                    p2_sum += 1
    return t1_sum / n, t2_sum / n, p1_sum / n, p2_sum / n, p12_sum / n
我的原始代码每次运行耗时约 650µs。
@CodeSurgeon 编写的 mktout 和 mktout_if 每次运行分别耗时约 220µs 和 120µs。
from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    """Return the row means of t1, t2, p1 and p2 over ``errors``.

    Each row is classified with 0/1 case flags and the flags are used as
    multipliers, so the loop body contains no ``if`` statements.

    Args:
        mean_mu_alpha: List of four numbers; ``exp`` of each one scales the
            matching column of ``errors``.
        errors: C-contiguous 2-D array of doubles; columns 0-3 are read.
        par_gamma: ``exp(par_gamma)`` scales mu10 and mu20 into mu11 and mu21.

    Returns:
        Tuple ``(t1_mean, t2_mean, p1_mean, p2_mean)``.
    """
    cdef:
        size_t row, k, n_rows
        double[4] scale
        double gamma_scale
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint first_larger, first_not_larger
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double both_num, both_den
        double t0, t1, t2
        double p12
        double sum_t1, sum_t2, sum_p1, sum_p2
        double c

    n_rows = errors.shape[0]
    # The exponentials do not depend on the row, so compute them once.
    for k in range(4):
        scale[k] = cmath.exp(<double>mean_mu_alpha[k])
    gamma_scale = cmath.exp(par_gamma)
    c = 168.0
    sum_t1 = sum_t2 = sum_p1 = sum_p2 = 0.0

    for row in range(n_rows):
        mu10 = errors[row, 0] * scale[0]
        mu20 = errors[row, 1] * scale[1]
        alpha1 = errors[row, 2] * scale[2]
        alpha2 = errors[row, 3] * scale[3]
        mu11 = gamma_scale * mu10
        mu21 = gamma_scale * mu20
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        first_larger = mu10 > mu20
        first_not_larger = not first_larger

        # case1-3 can only be set when mu10 > mu20, case4-6 otherwise.
        case1 = first_larger * (mu10 < 1 / c)
        case2 = first_larger * (mu21 >= threshold2)
        case3 = first_larger ^ (case1 | case2)
        case4 = first_not_larger * (mu20 < 1 / c)
        case5 = first_not_larger * (mu11 >= threshold3)
        case6 = first_not_larger ^ (case4 | case5)

        # Numerator and denominator shared by the case2 and case5 terms.
        both_num = c + alpha1 + alpha2
        both_den = 1 + mu11 * alpha1 + mu21 * alpha2
        t0 = (case1 * c
              + case2 * both_num / both_den
              + case3 / threshold2
              + case4 * c
              + case5 * both_num / both_den
              + case6 / threshold3)
        t1 = (case2 * (t0 * alpha1 * mu11 - alpha1)
              + case3 * (t0 * alpha1 * mu10 - alpha1)
              + case5 * (t0 * alpha1 * mu11 - alpha1))
        t2 = c - t0 - t1
        p12 = case2 + case5

        sum_t1 += t1
        sum_t2 += t2
        sum_p1 += case3 + p12
        sum_p2 += case6 + p12
    return sum_t1 / n_rows, sum_t2 / n_rows, sum_p1 / n_rows, sum_p2 / n_rows
from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
from cython.parallel cimport prange
def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    """Return the row means of t1, t2, p1 and p2 over ``errors``.

    Each row is classified with 0/1 case flags and the flags are used as
    multipliers, so the loop body contains no ``if`` statements.

    Args:
        mean_mu_alpha: List of four numbers; ``exp`` of each one scales the
            matching column of ``errors``.
        errors: C-contiguous 2-D array of doubles; columns 0-3 are read.
        par_gamma: ``exp(par_gamma)`` scales mu10 and mu20 into mu11 and mu21.

    Returns:
        Tuple ``(t1_mean, t2_mean, p1_mean, p2_mean)``.
    """
    cdef:
        size_t row, k, n_rows
        double[4] scale
        double gamma_scale
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint first_larger, first_not_larger
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double both_num, both_den
        double t0, t1, t2
        double p12
        double sum_t1, sum_t2, sum_p1, sum_p2
        double c

    n_rows = errors.shape[0]
    # The exponentials do not depend on the row, so compute them once.
    for k in range(4):
        scale[k] = cmath.exp(<double>mean_mu_alpha[k])
    gamma_scale = cmath.exp(par_gamma)
    c = 168.0
    sum_t1 = sum_t2 = sum_p1 = sum_p2 = 0.0

    for row in range(n_rows):
        mu10 = errors[row, 0] * scale[0]
        mu20 = errors[row, 1] * scale[1]
        alpha1 = errors[row, 2] * scale[2]
        alpha2 = errors[row, 3] * scale[3]
        mu11 = gamma_scale * mu10
        mu21 = gamma_scale * mu20
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        first_larger = mu10 > mu20
        first_not_larger = not first_larger

        # case1-3 can only be set when mu10 > mu20, case4-6 otherwise.
        case1 = first_larger * (mu10 < 1 / c)
        case2 = first_larger * (mu21 >= threshold2)
        case3 = first_larger ^ (case1 | case2)
        case4 = first_not_larger * (mu20 < 1 / c)
        case5 = first_not_larger * (mu11 >= threshold3)
        case6 = first_not_larger ^ (case4 | case5)

        # Numerator and denominator shared by the case2 and case5 terms.
        both_num = c + alpha1 + alpha2
        both_den = 1 + mu11 * alpha1 + mu21 * alpha2
        t0 = (case1 * c
              + case2 * both_num / both_den
              + case3 / threshold2
              + case4 * c
              + case5 * both_num / both_den
              + case6 / threshold3)
        t1 = (case2 * (t0 * alpha1 * mu11 - alpha1)
              + case3 * (t0 * alpha1 * mu10 - alpha1)
              + case5 * (t0 * alpha1 * mu11 - alpha1))
        t2 = c - t0 - t1
        p12 = case2 + case5

        sum_t1 += t1
        sum_t2 += t2
        sum_p1 += case3 + p12
        sum_p2 += case6 + p12
    return sum_t1 / n_rows, sum_t2 / n_rows, sum_p1 / n_rows, sum_p2 / n_rows
# Plain C struct of four doubles. In this file it carries both the four
# mean_mu_alpha inputs and the four averaged outputs of the cy_* functions.
ctypedef struct Vec4:
    double a
    double b
    double c
    double d
def outer_loop(list mean_mu_alpha, double[:, ::1] errors, double par_gamma, size_t n):
    """Call ``cy_mktout`` ``n`` times inside a ``prange`` loop.

    Args:
        mean_mu_alpha: List of four numbers, copied into a ``Vec4``.
        errors: C-contiguous 2-D array of doubles forwarded to ``cy_mktout``.
        par_gamma: Forwarded to ``cy_mktout`` unchanged.
        n: Number of times ``cy_mktout`` is invoked.

    Returns:
        The ``Vec4`` filled in by ``cy_mktout`` (all zeros when ``n`` is 0),
        handed back to Python through Cython's struct conversion.
    """
    cdef:
        size_t i
        Vec4 mean_vec
        Vec4 out

    mean_vec.a = <double>(mean_mu_alpha[0])
    mean_vec.b = <double>(mean_mu_alpha[1])
    mean_vec.c = <double>(mean_mu_alpha[2])
    mean_vec.d = <double>(mean_mu_alpha[3])

    # Give ``out`` defined contents: with n == 0 the loop body never runs
    # and the struct would otherwise be returned uninitialised.
    out.a = 0.0
    out.b = 0.0
    out.c = 0.0
    out.d = 0.0

    with nogil:
        # NOTE(review): every iteration passes the same inputs and writes
        # into the same ``out``; confirm that sharing it across prange
        # iterations is intended rather than using a per-thread buffer.
        for i in prange(n):
            cy_mktout(&out, &mean_vec, errors, par_gamma)
    return out
cdef void cy_mktout(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    # GIL-free, branch-free pass over the rows of ``errors``.
    #
    # mean_mu_alpha: four values; exp() of each scales the matching column
    #                of ``errors`` (a, b -> columns 0, 1; c, d -> columns 2, 3).
    # par_gamma:     exp(par_gamma) scales mu10 and mu20 into mu11 and mu21.
    # out:           receives the row means of t1, t2, p1, p2 in a, b, c, d.
    cdef:
        size_t row, n_rows
        double[4] scale
        double gamma_scale
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint first_larger, first_not_larger
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double both_num, both_den
        double t0, t1, t2
        double p12
        double sum_t1, sum_t2, sum_p1, sum_p2
        double c

    n_rows = errors.shape[0]
    # The exponentials do not depend on the row, so compute them once.
    scale[0] = cmath.exp(mean_mu_alpha.a)
    scale[1] = cmath.exp(mean_mu_alpha.b)
    scale[2] = cmath.exp(mean_mu_alpha.c)
    scale[3] = cmath.exp(mean_mu_alpha.d)
    gamma_scale = cmath.exp(par_gamma)
    c = 168.0
    sum_t1 = sum_t2 = sum_p1 = sum_p2 = 0.0

    for row in range(n_rows):
        mu10 = errors[row, 0] * scale[0]
        mu20 = errors[row, 1] * scale[1]
        alpha1 = errors[row, 2] * scale[2]
        alpha2 = errors[row, 3] * scale[3]
        mu11 = gamma_scale * mu10
        mu21 = gamma_scale * mu20
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        first_larger = mu10 > mu20
        first_not_larger = not first_larger

        # case1-3 can only be set when mu10 > mu20, case4-6 otherwise.
        case1 = first_larger * (mu10 < 1 / c)
        case2 = first_larger * (mu21 >= threshold2)
        case3 = first_larger ^ (case1 | case2)
        case4 = first_not_larger * (mu20 < 1 / c)
        case5 = first_not_larger * (mu11 >= threshold3)
        case6 = first_not_larger ^ (case4 | case5)

        # Numerator and denominator shared by the case2 and case5 terms.
        both_num = c + alpha1 + alpha2
        both_den = 1 + mu11 * alpha1 + mu21 * alpha2
        t0 = (case1 * c
              + case2 * both_num / both_den
              + case3 / threshold2
              + case4 * c
              + case5 * both_num / both_den
              + case6 / threshold3)
        t1 = (case2 * (t0 * alpha1 * mu11 - alpha1)
              + case3 * (t0 * alpha1 * mu10 - alpha1)
              + case5 * (t0 * alpha1 * mu11 - alpha1))
        t2 = c - t0 - t1
        p12 = case2 + case5

        sum_t1 += t1
        sum_t2 += t2
        sum_p1 += case3 + p12
        sum_p2 += case6 + p12

    out.a = sum_t1 / n_rows
    out.b = sum_t2 / n_rows
    out.c = sum_p1 / n_rows
    out.d = sum_p2 / n_rows
from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension
import numpy as np
import os
import shutil
import platform
# Extra libraries to link, keyed by the value of platform.system().
libraries = dict(Linux=[], Windows=[])

# Source language handed to every Extension.
language = "c"

# Compiler flags applied to every extension.
args = [
    "-w",
    "-std=c11",
    "-O3",
    "-ffast-math",
    "-march=native",
    "-fopenmp",
]
# Linker flags applied to every extension.
link_args = [
    "-std=c11",
    "-fopenmp",
]

# Passed to cythonize(annotate=...).
annotate = True

# Cython compiler directives applied to every module.
directives = dict(
    binding=True,
    boundscheck=False,
    wraparound=False,
    initializedcheck=False,
    cdivision=True,
    nonecheck=False,
    language_level="3",
)
def pyx_module_name(ext_path):
    """Return the dotted module name for the ``.pyx`` file at *ext_path*.

    The path is normalised first, so a leading ``./`` and either kind of
    path separator are handled; only the final extension is removed.

    >>> pyx_module_name("./pkg/mod.pyx")
    'pkg.mod'
    """
    stem, _ext = os.path.splitext(os.path.normpath(ext_path))
    return stem.replace(os.sep, ".")


if __name__ == "__main__":
    system = platform.system()
    # Systems without an entry (e.g. "Darwin" on a Mac) link no extra
    # libraries instead of failing with KeyError.
    libs = libraries.get(system, [])
    extensions = []

    # create extensions: one per .pyx file found below the current directory
    for path, dirs, file_names in os.walk("."):
        for file_name in file_names:
            if not file_name.endswith(".pyx"):
                continue
            ext_path = os.path.join(path, file_name)
            extensions.append(
                Extension(
                    name=pyx_module_name(ext_path),
                    sources=[ext_path],
                    libraries=libs,
                    language=language,
                    extra_compile_args=args,
                    extra_link_args=link_args,
                    include_dirs=[np.get_include()],
                )
            )

    # setup all extensions
    ext_modules = cythonize(
        extensions,
        annotate=annotate,
        compiler_directives=directives,
    )
    setup(ext_modules=ext_modules)

    # NOTE: the original script carried a disabled snippet here that removed
    # the "./build" directory with shutil.rmtree right after building; it
    # stays disabled.
cdef void cy_mktout_if(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    # GIL-free pass over the rows of ``errors`` that branches once per row
    # on mu10 > mu20 and evaluates only that side's case flags.
    #
    # mean_mu_alpha: four values; exp() of each scales the matching column
    #                of ``errors`` (a, b -> columns 0, 1; c, d -> columns 2, 3).
    # par_gamma:     exp(par_gamma) scales mu10 and mu20 into mu11 and mu21.
    # out:           receives the row means of t1, t2, p1, p2 in a, b, c, d.
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    # compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0
    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        if j_is_larger:
            # threshold2 is only needed on this side of the branch.
            threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
            case1 = mu10 < 1 / c
            case2 = mu21 >= threshold2
            case3 = not (case1 | case2)
            t0 = case1 * c + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case3 / threshold2
            t1 = case2 * (t0 * alpha1 * mu11 - alpha1) + case3 * (t0 * alpha1 * mu10 - alpha1)
            t2 = c - t0 - t1
            t1_sum += t1
            t2_sum += t2
            p1_sum += case2 + case3
            p2_sum += case2
        else:
            # threshold3 is only needed on this side of the branch.
            threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
            case4 = mu20 < 1 / c
            case5 = mu11 >= threshold3
            case6 = not (case4 | case5)
            t0 = case4 * c + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
            t1 = case5 * (t0 * alpha1 * mu11 - alpha1)
            t2 = c - t0 - t1
            t1_sum += t1
            t2_sum += t2
            p1_sum += case5
            p2_sum += case5 + case6

    out.a = t1_sum / n
    out.b = t2_sum / n
    out.c = p1_sum / n
    out.d = p2_sum / n
outer_loop: 0.5116949229995953 seconds
outer_loop_if: 0.617649456995423 seconds
mktout: 0.9221872320049442 seconds
mktout_if: 1.430276553001022 seconds
python: 10.116664300003322 seconds