Python Statsr模型中的泊松回归
给定一些随机生成的数据Python Statsr模型中的泊松回归,python,plot,machine-learning,statsmodels,Python,Plot,Machine Learning,Statsmodels,给定一些随机生成的数据 2列 50行和 0-100之间的整数范围 使用R,可以获得泊松glm和诊断图,如下所示: > col=2 > row=50 > range=0:100 > df <- data.frame(replicate(col,sample(range,row,rep=TRUE))) > model <- glm(X2 ~ X1, data = df, family = poisson) > glm.diag.plots(mode
- 2列
- 50行和
- 0-100之间的整数范围
> col=2
> row=50
> range=0:100
> df <- data.frame(replicate(col,sample(range,row,rep=TRUE)))
> model <- glm(X2 ~ X1, data = df, family = poisson)
> glm.diag.plots(model)
并用Python绘制诊断图:
model_fitted_y = results.fittedvalues # fitted values (need a constant term for intercept)
model_residuals = results.resid # model residuals
model_abs_resid = np.abs(model_residuals) # absolute residuals
plot_lm_1 = plt.figure(1)
plot_lm_1.set_figheight(8)
plot_lm_1.set_figwidth(12)
plot_lm_1.axes[0] = sns.residplot(model_fitted_y, 'X2', data=df, lowess=True, scatter_kws={'alpha': 0.5}, line_kws={'color': 'red', 'lw': 1, 'alpha': 0.8})
plot_lm_1.axes[0].set_xlabel('Line Predictor')
plot_lm_1.axes[0].set_ylabel('Residuals')
plt.show()
但当我试图得到库克的统计数据时
# cook's distance, from statsmodels internals
model_cooks = results.get_influence().cooks_distance[0]
它抛出了一个错误,说:
AttributeError Traceback (most recent call last)
<ipython-input-66-0f2bedfa1741> in <module>()
4 model_residuals = results.resid
5 # normalized residuals
----> 6 model_norm_residuals = results.get_influence().resid_studentized_internal
7 # absolute squared normalized residuals
8 model_norm_residuals_abs_sqrt = np.sqrt(np.abs(model_norm_residuals))
/opt/conda/lib/python3.6/site-packages/statsmodels/base/wrapper.py in __getattribute__(self, attr)
33 pass
34
---> 35 obj = getattr(results, attr)
36 data = results.model.data
37 how = self._wrap_attrs.get(attr)
AttributeError: 'GEEResults' object has no attribute 'get_influence'
AttributeError回溯(最近一次调用)
在()
4模型_残差=结果。残差
5#归一化残差
---->6模型标准残差=结果。获取影响()。残差学生化
7#绝对平方归一化残差
8模型(范数)残差(abs)sqrt=np.sqrt(np.abs(模型)残差)
/opt/conda/lib/python3.6/site-packages/statsmodels/base/wrapper.py in_u__getattribute_u_(self,attr)
33通行证
34
--->35 obj=获取属性(结果,属性)
36数据=结果.model.data
37 how=self.\u wrap\u attrs.get(attr)
AttributeError:“GEEResults”对象没有属性“get\u influence”
是否有一种方法可以像在R中那样在Python中绘制出所有4个诊断图?
如何在Python中使用
statsmodels
检索拟合模型结果的cook统计信息?广义估计方程API应给出与R的GLM模型估计不同的结果。要在statsmodels中获得类似的估计,您需要使用以下内容:
import pandas as pd
import statsmodels.api as sm
# Read data generated in R using pandas or something similar
df = pd.read_csv(...) # file name goes here
# Add a column of ones for the intercept to create input X
X = np.column_stack( (np.ones((df.shape[0], 1)), df.X1) )
# Relabel dependent variable as y (standard notation)
y = df.X2
# Fit GLM in statsmodels using Poisson link function
sm.GLM(y, X, family = Poisson()).fit().summary()
编辑——下面是关于如何在泊松回归中获得库克距离的其余答案。这是我根据R中生成的一些数据编写的脚本。我将我的值与R中使用cooks.distance函数计算的值以及匹配的值进行了比较
from __future__ import division, print_function
import numpy as np
import pandas as pd
import statsmodels.api as sm
PATH = '/Users/robertmilletich/test_reg.csv'
def _weight_matrix(fitted_model):
"""Calculates weight matrix in Poisson regression
Parameters
----------
fitted_model : statsmodel object
Fitted Poisson model
Returns
-------
W : 2d array-like
Diagonal weight matrix in Poisson regression
"""
return np.diag(fitted_model.fittedvalues)
def _hessian(X, W):
"""Hessian matrix calculated as -X'*W*X
Parameters
----------
X : 2d array-like
Matrix of covariates
W : 2d array-like
Weight matrix
Returns
-------
hessian : 2d array-like
Hessian matrix
"""
return -np.dot(X.T, np.dot(W, X))
def _hat_matrix(X, W):
"""Calculate hat matrix = W^(1/2) * X * (X'*W*X)^(-1) * X'*W^(1/2)
Parameters
----------
X : 2d array-like
Matrix of covariates
W : 2d array-like
Diagonal weight matrix
Returns
-------
hat : 2d array-like
Hat matrix
"""
# W^(1/2)
Wsqrt = W**(0.5)
# (X'*W*X)^(-1)
XtWX = -_hessian(X = X, W = W)
XtWX_inv = np.linalg.inv(XtWX)
# W^(1/2)*X
WsqrtX = np.dot(Wsqrt, X)
# X'*W^(1/2)
XtWsqrt = np.dot(X.T, Wsqrt)
return np.dot(WsqrtX, np.dot(XtWX_inv, XtWsqrt))
def main():
# Load data and separate into X and y
df = pd.read_csv(PATH)
X = np.column_stack( (np.ones((df.shape[0], 1)), df.X1 ) )
y = df.X2
# Fit model
model = sm.GLM(y, X, family=sm.families.Poisson()).fit()
# Weight matrix
W = _weight_matrix(model)
# Hat matrix
H = _hat_matrix(X, W)
hii = np.diag(H) # Diagonal values of hat matrix
# Pearson residuals
r = model.resid_pearson
# Cook's distance (formula used by R = (res/(1 - hat))^2 * hat/(dispersion * p))
# Note: dispersion is 1 since we aren't modeling overdispersion
cooks_d = (r/(1 - hii))**2 * hii/(1*2)
if __name__ == "__main__":
main()
离群值和影响度量仅适用于OLS,也可能适用于WLS。(使用一些GLM残差可能不难,但需要针对R或Stata进行单元测试。GEE可能更难。)从某些方面来说,R确实是王者。虽然Python的代码比R少且短,但在后一种语言中,许多工作都是通过少量命令完成的。我想念R的命令;)
from __future__ import division, print_function
import numpy as np
import pandas as pd
import statsmodels.api as sm
PATH = '/Users/robertmilletich/test_reg.csv'
def _weight_matrix(fitted_model):
"""Calculates weight matrix in Poisson regression
Parameters
----------
fitted_model : statsmodel object
Fitted Poisson model
Returns
-------
W : 2d array-like
Diagonal weight matrix in Poisson regression
"""
return np.diag(fitted_model.fittedvalues)
def _hessian(X, W):
"""Hessian matrix calculated as -X'*W*X
Parameters
----------
X : 2d array-like
Matrix of covariates
W : 2d array-like
Weight matrix
Returns
-------
hessian : 2d array-like
Hessian matrix
"""
return -np.dot(X.T, np.dot(W, X))
def _hat_matrix(X, W):
"""Calculate hat matrix = W^(1/2) * X * (X'*W*X)^(-1) * X'*W^(1/2)
Parameters
----------
X : 2d array-like
Matrix of covariates
W : 2d array-like
Diagonal weight matrix
Returns
-------
hat : 2d array-like
Hat matrix
"""
# W^(1/2)
Wsqrt = W**(0.5)
# (X'*W*X)^(-1)
XtWX = -_hessian(X = X, W = W)
XtWX_inv = np.linalg.inv(XtWX)
# W^(1/2)*X
WsqrtX = np.dot(Wsqrt, X)
# X'*W^(1/2)
XtWsqrt = np.dot(X.T, Wsqrt)
return np.dot(WsqrtX, np.dot(XtWX_inv, XtWsqrt))
def main():
# Load data and separate into X and y
df = pd.read_csv(PATH)
X = np.column_stack( (np.ones((df.shape[0], 1)), df.X1 ) )
y = df.X2
# Fit model
model = sm.GLM(y, X, family=sm.families.Poisson()).fit()
# Weight matrix
W = _weight_matrix(model)
# Hat matrix
H = _hat_matrix(X, W)
hii = np.diag(H) # Diagonal values of hat matrix
# Pearson residuals
r = model.resid_pearson
# Cook's distance (formula used by R = (res/(1 - hat))^2 * hat/(dispersion * p))
# Note: dispersion is 1 since we aren't modeling overdispersion
cooks_d = (r/(1 - hii))**2 * hii/(1*2)
if __name__ == "__main__":
main()