
Python implementation of the Wilson score interval?


After reading that post, I'm curious: does anyone have a Python implementation of the lower bound of the Wilson score confidence interval for a Bernoulli parameter?

Reddit uses the Wilson score interval for comment ranking; an explanation and a Python implementation can be found there.
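For context, the quantity being asked about is the lower bound of the Wilson score interval for an observed success fraction $\hat p$ over $n$ trials, with $z$ the normal quantile for the chosen confidence level:

$$\frac{\hat p + \frac{z^2}{2n} - z\sqrt{\frac{\hat p(1-\hat p)}{n} + \frac{z^2}{4n^2}}}{1 + \frac{z^2}{n}}$$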


I think the linked implementation calls wilson incorrectly, because with 1 up-vote and 0 down-votes you get NaN: you cannot take the sqrt of a negative value.

The correct version can be found by reading the Ruby example in this article:


If you want to compute z directly from the confidence interval, and want to avoid installing numpy/scipy, you can use the following snippet:

import math


def binconf(p, n, c=0.95):
    '''
    Calculate binomial confidence interval based on the number of positive and
    negative events observed.  Uses Wilson score and approximation to inverse
    of normal cumulative density function.

    Parameters
    ----------
    p: int
        number of positive events observed
    n: int
        number of negative events observed
    c: optional, [0, 1]
        confidence percentage. e.g. 0.95 means the true probability of success
        lies between the two returned values with 95% confidence

    Returns
    -------
    theta_low: float
        lower bound on confidence interval
    theta_high: float
        upper bound on confidence interval
    '''
    p, n = float(p), float(n)
    N = p + n

    if N == 0.0:
        return (0.0, 1.0)

    p = p / N
    z = normcdfi(1 - 0.5 * (1 - c))

    a1 = 1.0 / (1.0 + z * z / N)
    a2 = p + z * z / (2 * N)
    a3 = z * math.sqrt(p * (1 - p) / N + z * z / (4 * N * N))

    return (a1 * (a2 - a3), a1 * (a2 + a3))


def erfi(x):
    """Approximation to inverse error function"""
    a = 0.147  # MAGIC!!!
    a1 = math.log(1 - x * x)
    a2 = 2.0 / (math.pi * a) + a1 / 2.0
    return sign(x) * math.sqrt(math.sqrt(a2 * a2 - a1 / a) - a2)


def sign(x):
    if x < 0:
        return -1
    if x == 0:
        return 0
    if x > 0:
        return 1


def normcdfi(p, mu=0.0, sigma2=1.0):
    """Inverse CDF of normal distribution"""
    if mu == 0.0 and sigma2 == 1.0:
        return math.sqrt(2) * erfi(2 * p - 1)
    else:
        return mu + math.sqrt(sigma2) * normcdfi(p)
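A quick usage sketch for the snippet above (the counts are made up for illustration):

print(binconf(10, 2))   # 10 positive, 2 negative observations, default 95% confidence
print(binconf(1, 0))    # 1 up / 0 down: no NaN, returns roughly (0.21, 1.0)
print(binconf(0, 0))    # no observations at all -> (0.0, 1.0)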

To get a Wilson CI without continuity correction, you can use proportion_confint in statsmodels.stats.proportion. To get a Wilson CI with continuity correction, you can use the code below.

# cf. 
# [1] R. G. Newcombe. Two-sided confidence intervals for the single proportion, 1998
# [2] R. G. Newcombe. Interval Estimation for the difference between independent proportions: comparison of eleven methods, 1998

import numpy as np
from statsmodels.stats.proportion import proportion_confint

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
def propci_wilson_cc(count, nobs, alpha=0.05):
    # get confidence limits for proportion
    # using wilson score method w/ cont correction
    # i.e. Method 4 in Newcombe [1]; 
    # verified via Table 1
    from scipy import stats
    n = nobs
    p = count/n
    q = 1.-p
    z = stats.norm.isf(alpha / 2.)
    z2 = z**2   
    denom = 2*(n+z2)
    num = 2.*n*p+z2-1.-z*np.sqrt(z2-2-1./n+4*p*(n*q+1))    
    ci_l = num/denom
    num = 2.*n*p+z2+1.+z*np.sqrt(z2+2-1./n+4*p*(n*q-1))
    ci_u = num/denom
    if p == 0:
        ci_l = 0.
    elif p == 1:
        ci_u = 1.
    return ci_l, ci_u


def dpropci_wilson_nocc(a,m,b,n,alpha=0.05):
    # get confidence limits for difference in proportions
    #   a/m - b/n
    # using wilson score method WITHOUT cont correction
    # i.e. Method 10 in Newcombe [2]
    # verified via Table II    
    theta = a/m - b/n        
    l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson')
    l2, u2 = proportion_confint(count=b, nobs=n, alpha=0.05, method='wilson')
    ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2)
    ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2)     
    return ci_l, ci_u


def dpropci_wilson_cc(a,m,b,n,alpha=0.05):
    # get confidence limits for difference in proportions
    #   a/m - b/n
    # using wilson score method w/ cont correction
    # i.e. Method 11 in Newcombe [2]    
    # verified via Table II  
    theta = a/m - b/n    
    l1, u1 = propci_wilson_cc(count=a, nobs=m, alpha=alpha)
    l2, u2 = propci_wilson_cc(count=b, nobs=n, alpha=alpha)    
    ci_u = theta + np.sqrt((a/m-u1)**2+(b/n-l2)**2)
    ci_l = theta - np.sqrt((a/m-l1)**2+(b/n-u2)**2)     
    return ci_l, ci_u


# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
# single proportion testing 
# these come from Newcombe [1] (Table 1)
a_vec = np.array([81, 15, 0, 1])
m_vec = np.array([263, 148, 20, 29])
for (a,m) in zip(a_vec,m_vec):
    l1, u1 = proportion_confint(count=a, nobs=m, alpha=0.05, method='wilson')
    l2, u2 = propci_wilson_cc(count=a, nobs=m, alpha=0.05)
    print(a,m,l1,u1,'   ',l2,u2)

# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 
# difference in proportions testing 
# these come from Newcombe [2] (Table II)
a_vec = np.array([56,9,6,5,0,0,10,10],dtype=float)
m_vec = np.array([70,10,7,56,10,10,10,10],dtype=float)
b_vec = np.array([48,3,2,0,0,0,0,0],dtype=float)
n_vec = np.array([80,10,7,29,20,10,20,10],dtype=float)

print('\nWilson without CC')
for (a,m,b,n) in zip(a_vec,m_vec,b_vec,n_vec):
    l, u = dpropci_wilson_nocc(a,m,b,n,alpha=0.05)
    print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a,m,b,n,a/m-b/n,l,u))

print('\nWilson with CC')
for (a,m,b,n) in zip(a_vec,m_vec,b_vec,n_vec):
    l, u = dpropci_wilson_cc(a,m,b,n,alpha=0.05)
    print('{:2.0f}/{:2.0f}-{:2.0f}/{:2.0f} ; {:6.4f} ; {:8.4f}, {:8.4f}'.format(a,m,b,n,a/m-b/n,l,u))
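For the no-continuity-correction case mentioned at the top of this answer, a single statsmodels call is enough; a minimal sketch, reusing one of the Newcombe test cases above:

from statsmodels.stats.proportion import proportion_confint

# Wilson score interval without continuity correction, 95% confidence
low, high = proportion_confint(count=81, nobs=263, alpha=0.05, method='wilson')
print(low, high)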

HTH

The accepted solution seems to use a hard-coded z value (which is best for performance).

If you want a direct Python equivalent of the Ruby formula with a dynamic z value (computed from the confidence level), see the ci_lower_bound snippet at the end of this post:


Comments on this answer:

If you just want to post a link, it can go in a comment. If you post it as an answer, please add more information and/or pull the code out of the linked content, so that not everyone has to follow the link and the answer keeps its value even if the link dies.

This answer should be corrected to include the following modification:

return ((phat + z*z/(2*n) - z * sqrt((phat*(1-phat)+z*z/(4*n))/n))/(1+z*z/n))

@Vladtn I just updated it with Gullevek's answer. Let me know if anything else is off.

I would just add that for a 95% confidence interval the z-score should be 1.96, not 1.6.

@Wesley yes, I believe 1.0 = 85% was also wrong; I've updated the answer...

If n*phat*(1-phat) is below some threshold, say 30-35 (here is a table of more exact values), I would use a t-distribution with df = (pos+neg)-2 instead of the normal distribution (a sketch of that variant appears at the end of this post). Just my two cents, though.
The ci_lower_bound snippet referenced above (the Ruby formula with a dynamic z value):

import math

import scipy.stats as st


def ci_lower_bound(pos, n, confidence):
    if n == 0:
        return 0
    z = st.norm.ppf(1 - (1 - confidence) / 2)
    phat = 1.0 * pos / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)
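A minimal sketch of the t-distribution variant suggested in the comments: the only change is taking the critical value from a t-distribution with (pos + neg) - 2 degrees of freedom instead of the normal. The function name and the df floor of 1 are my own additions, not part of any answer above.

import math

import scipy.stats as st


def ci_lower_bound_t(pos, n, confidence=0.95):
    # Wilson lower bound as above, but with the quantile from a t-distribution
    # with (pos + neg) - 2 degrees of freedom, as suggested in the comments for
    # small samples.  Here n is the total number of ratings, as in ci_lower_bound.
    if n == 0:
        return 0
    df = max(n - 2, 1)  # floor at 1 to avoid zero/negative degrees of freedom (my safeguard)
    z = st.t.ppf(1 - (1 - confidence) / 2, df)
    phat = 1.0 * pos / n
    return (phat + z * z / (2 * n) - z * math.sqrt((phat * (1 - phat) + z * z / (4 * n)) / n)) / (1 + z * z / n)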