Warning: file_get_contents(/data/phpspider/zhask/data//catemap/2/python/304.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Python 基于pymc3的贝叶斯逻辑回归预测_Python_Theano_Pymc3 - Fatal编程技术网

Python 基于pymc3的贝叶斯逻辑回归预测

Python 基于pymc3的贝叶斯逻辑回归预测,python,theano,pymc3,Python,Theano,Pymc3,我试图使用pymc3执行贝叶斯逻辑回归,但在使用该模型执行预测时遇到了一个问题 数据: AsTensorError: ('Variable type field must be a TensorType.', <Generic>, <theano.gof.type.Generic object at 0x00000216ABB16730>) %matplotlib inline from pathlib import Path import pickle from co

我试图使用pymc3执行贝叶斯逻辑回归,但在使用该模型执行预测时遇到了一个问题

数据:

AsTensorError: ('Variable type field must be a TensorType.', <Generic>, <theano.gof.type.Generic object at 0x00000216ABB16730>)
%matplotlib inline
from pathlib import Path
import pickle
from collections import OrderedDict
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing
import arviz as az

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, 
                             precision_recall_curve, balanced_accuracy_score) 
from mlxtend.plotting import plot_confusion_matrix

import theano
import pymc3 as pm
from pymc3.variational.callbacks import CheckParametersConvergence
import statsmodels.formula.api as smf

import arviz as az
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import seaborn as sns
from IPython.display import HTML

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# intialise data of lists. 
data = {'BAD':[1,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1],
        'LOAN':[1700,1800,2300,2400,2400,2900,2900,2900,2900,
3000,3200,3300,3600,3600,3700,3800,3900],
        'MORTDUE':[30548,28502,102370,34863,98449,103949,104373,7750,61962,104570,
74864,130518,100693,52337,17857,51180,29896],
        'VALUE':[40320,43034,120953,47471,117195,112505,120702,67996,70915,121729,
87266,164317,114743,63989,21144,63459,45960],
        'REASON':['HomeImp','HomeImp','HomeImp','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp',
                  'DebtCon','HomeImp','HomeImp','DebtCon','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp'],
        'JOB':['Other','Other','Office','Mgr','Office','Office','Office',
'Other','Mgr','Office','ProfExe',
               'Other','Office','Office','Other','Office','Other'],
        'YOJ':[9,11,2,12,4,1,2,16,2,2,7,9,6,20,5,20,11],
        'DEROG':[0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0],
        'DELINQ':[0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0],
        'CLAGE':[101.4660019,88.76602988,90.99253347,70.49108003,
93.81177486,96.10232967,101.5402975,
                 122.2046628,282.8016592,85.8843719,250.6312692,
192.289149,88.47045214,204.2724988,
                 129.7173231,203.7515336,146.1232422],
        'NINQ':[1,0,0,1,0,0,0,2,3,0,0,0,0,0,1,0,0],
        'CLNO':[8,8,13,21,13,13,13,8,37,14,12,33,14,20,9,20,14],
        'DEBTINC':[37.11361356,36.88489409,31.58850318,38.26360073,
29.68182705,30.05113629,29.91585903,
                   36.211348,49.20639579,32.05978327,42.90999735,
35.73055919,29.39354338,20.47091551,
                   26.63434752,20.06704205,24.47888119]
       } 
  
# Create DataFrame 
data = pd.DataFrame(data) 

# datatype defining
data[['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 
'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']] = data[['BAD', 'LOAN', 'MORTDUE', 
'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC' ]].apply(pd.to_numeric) 
data[['REASON', 'JOB']] = data[['REASON', 'JOB']].apply(lambda x: x.astype('category'))
print(data.dtypes) 
data.dropna(axis=0, how='any',inplace=True)    

# test train split
X = data.drop('BAD', axis=1)
y = data.BAD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)
labels = X_train.columns


# model training (error cause)
X_shared = theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train, 
               labels=labels,
               family=pm.glm.families.Binomial())


# Prediction on test data

X_shared = theano.shared(X_test)

ppc = pm.sample_ppc(pred_trace,
                    model=logistic_model_pred,
                    samples=100)


# AUC
np.mean(ppc['y'], axis=0).shape
y_score = np.mean(ppc['y'], axis=0)
roc_auc_score(y_score=np.mean(ppc['y'], axis=0), 
              y_true=y_test)

pred_scores = dict(y_true=y_test, y_score=y_score)
cols = ['False Positive Rate', 'True Positive Rate', 'threshold']
roc = pd.DataFrame(dict(zip(cols, roc_curve(**pred_scores))))

precision, recall, ts = precision_recall_curve(y_true=y_test, probas_pred=y_score)
pr_curve = pd.DataFrame({'Precision': precision, 'Recall': recall})

f1 = pd.Series({t: f1_score(y_true=y_test, y_pred=y_score>t) for t in ts})
best_threshold = f1.idxmax()




# Alternative solution

with pm.Model() as logistic_model_pred:
    pm.glm.GLM.from_formula('BAD ~ DELINQ + DEROG + DEBTINC + NINQ + 
CLNO + VALUE + MORTDUE + YOJ + LOAN + CLAGE + JOB',
                            data=pd.concat([y_train.reset_index(drop=True), X_train], axis=1),
                            family=pm.glm.families.Binomial())
    pred_trace = pm.sample(tune=1500,
                      draws=1000,
                         chains=4,
                         cores=1,
                      init='adapt_diag')
我的数据集是住房贷款违约数据,样本数据如下:

BAD LOAN MORTDUE VALUE  REASON  JOB     YOJ DEROG   DELINQ  CLAGE     NINQ  CLNO    DEBTINC
1   1700 0548    40320  HomeImp Other   9   0       0       101.466002  1   8      37.113614
1   1800 28502   43034  HomeImp Other   11  0       0       88.766030   0   8      36.884894
0   2300 102370  120953 HomeImp Office  2   0       0       90.992533   0   13     31.588503
问题:

AsTensorError: ('Variable type field must be a TensorType.', <Generic>, <theano.gof.type.Generic object at 0x00000216ABB16730>)
%matplotlib inline
from pathlib import Path
import pickle
from collections import OrderedDict
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing
import arviz as az

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, 
                             precision_recall_curve, balanced_accuracy_score) 
from mlxtend.plotting import plot_confusion_matrix

import theano
import pymc3 as pm
from pymc3.variational.callbacks import CheckParametersConvergence
import statsmodels.formula.api as smf

import arviz as az
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import seaborn as sns
from IPython.display import HTML

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# intialise data of lists. 
data = {'BAD':[1,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1],
        'LOAN':[1700,1800,2300,2400,2400,2900,2900,2900,2900,
3000,3200,3300,3600,3600,3700,3800,3900],
        'MORTDUE':[30548,28502,102370,34863,98449,103949,104373,7750,61962,104570,
74864,130518,100693,52337,17857,51180,29896],
        'VALUE':[40320,43034,120953,47471,117195,112505,120702,67996,70915,121729,
87266,164317,114743,63989,21144,63459,45960],
        'REASON':['HomeImp','HomeImp','HomeImp','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp',
                  'DebtCon','HomeImp','HomeImp','DebtCon','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp'],
        'JOB':['Other','Other','Office','Mgr','Office','Office','Office',
'Other','Mgr','Office','ProfExe',
               'Other','Office','Office','Other','Office','Other'],
        'YOJ':[9,11,2,12,4,1,2,16,2,2,7,9,6,20,5,20,11],
        'DEROG':[0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0],
        'DELINQ':[0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0],
        'CLAGE':[101.4660019,88.76602988,90.99253347,70.49108003,
93.81177486,96.10232967,101.5402975,
                 122.2046628,282.8016592,85.8843719,250.6312692,
192.289149,88.47045214,204.2724988,
                 129.7173231,203.7515336,146.1232422],
        'NINQ':[1,0,0,1,0,0,0,2,3,0,0,0,0,0,1,0,0],
        'CLNO':[8,8,13,21,13,13,13,8,37,14,12,33,14,20,9,20,14],
        'DEBTINC':[37.11361356,36.88489409,31.58850318,38.26360073,
29.68182705,30.05113629,29.91585903,
                   36.211348,49.20639579,32.05978327,42.90999735,
35.73055919,29.39354338,20.47091551,
                   26.63434752,20.06704205,24.47888119]
       } 
  
# Create DataFrame 
data = pd.DataFrame(data) 

# datatype defining
data[['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 
'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']] = data[['BAD', 'LOAN', 'MORTDUE', 
'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC' ]].apply(pd.to_numeric) 
data[['REASON', 'JOB']] = data[['REASON', 'JOB']].apply(lambda x: x.astype('category'))
print(data.dtypes) 
data.dropna(axis=0, how='any',inplace=True)    

# test train split
X = data.drop('BAD', axis=1)
y = data.BAD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)
labels = X_train.columns


# model training (error cause)
X_shared = theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train, 
               labels=labels,
               family=pm.glm.families.Binomial())


# Prediction on test data

X_shared = theano.shared(X_test)

ppc = pm.sample_ppc(pred_trace,
                    model=logistic_model_pred,
                    samples=100)


# AUC
np.mean(ppc['y'], axis=0).shape
y_score = np.mean(ppc['y'], axis=0)
roc_auc_score(y_score=np.mean(ppc['y'], axis=0), 
              y_true=y_test)

pred_scores = dict(y_true=y_test, y_score=y_score)
cols = ['False Positive Rate', 'True Positive Rate', 'threshold']
roc = pd.DataFrame(dict(zip(cols, roc_curve(**pred_scores))))

precision, recall, ts = precision_recall_curve(y_true=y_test, probas_pred=y_score)
pr_curve = pd.DataFrame({'Precision': precision, 'Recall': recall})

f1 = pd.Series({t: f1_score(y_true=y_test, y_pred=y_score>t) for t in ts})
best_threshold = f1.idxmax()




# Alternative solution

with pm.Model() as logistic_model_pred:
    pm.glm.GLM.from_formula('BAD ~ DELINQ + DEROG + DEBTINC + NINQ + 
CLNO + VALUE + MORTDUE + YOJ + LOAN + CLAGE + JOB',
                            data=pd.concat([y_train.reset_index(drop=True), X_train], axis=1),
                            family=pm.glm.families.Binomial())
    pred_trace = pm.sample(tune=1500,
                      draws=1000,
                         chains=4,
                         cores=1,
                      init='adapt_diag')
我想对测试数据集执行预测,一种方法是使用共享变量方法:

X_shared = theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train, 
               labels=labels,
               family=pm.glm.families.Binomial())


X_shared.set_value(X_test)
ppc = pm.sample_ppc(pred_trace,
                model=logistic_model_pred,
                samples=100)
但是,使用上述代码(theano共享变量)会导致以下问题:

错误消息:

AsTensorError: ('Variable type field must be a TensorType.', <Generic>, <theano.gof.type.Generic object at 0x00000216ABB16730>)
%matplotlib inline
from pathlib import Path
import pickle
from collections import OrderedDict
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing
import arviz as az

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, 
                             precision_recall_curve, balanced_accuracy_score) 
from mlxtend.plotting import plot_confusion_matrix

import theano
import pymc3 as pm
from pymc3.variational.callbacks import CheckParametersConvergence
import statsmodels.formula.api as smf

import arviz as az
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import seaborn as sns
from IPython.display import HTML

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# intialise data of lists. 
data = {'BAD':[1,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1],
        'LOAN':[1700,1800,2300,2400,2400,2900,2900,2900,2900,
3000,3200,3300,3600,3600,3700,3800,3900],
        'MORTDUE':[30548,28502,102370,34863,98449,103949,104373,7750,61962,104570,
74864,130518,100693,52337,17857,51180,29896],
        'VALUE':[40320,43034,120953,47471,117195,112505,120702,67996,70915,121729,
87266,164317,114743,63989,21144,63459,45960],
        'REASON':['HomeImp','HomeImp','HomeImp','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp',
                  'DebtCon','HomeImp','HomeImp','DebtCon','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp'],
        'JOB':['Other','Other','Office','Mgr','Office','Office','Office',
'Other','Mgr','Office','ProfExe',
               'Other','Office','Office','Other','Office','Other'],
        'YOJ':[9,11,2,12,4,1,2,16,2,2,7,9,6,20,5,20,11],
        'DEROG':[0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0],
        'DELINQ':[0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0],
        'CLAGE':[101.4660019,88.76602988,90.99253347,70.49108003,
93.81177486,96.10232967,101.5402975,
                 122.2046628,282.8016592,85.8843719,250.6312692,
192.289149,88.47045214,204.2724988,
                 129.7173231,203.7515336,146.1232422],
        'NINQ':[1,0,0,1,0,0,0,2,3,0,0,0,0,0,1,0,0],
        'CLNO':[8,8,13,21,13,13,13,8,37,14,12,33,14,20,9,20,14],
        'DEBTINC':[37.11361356,36.88489409,31.58850318,38.26360073,
29.68182705,30.05113629,29.91585903,
                   36.211348,49.20639579,32.05978327,42.90999735,
35.73055919,29.39354338,20.47091551,
                   26.63434752,20.06704205,24.47888119]
       } 
  
# Create DataFrame 
data = pd.DataFrame(data) 

# datatype defining
data[['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 
'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']] = data[['BAD', 'LOAN', 'MORTDUE', 
'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC' ]].apply(pd.to_numeric) 
data[['REASON', 'JOB']] = data[['REASON', 'JOB']].apply(lambda x: x.astype('category'))
print(data.dtypes) 
data.dropna(axis=0, how='any',inplace=True)    

# test train split
X = data.drop('BAD', axis=1)
y = data.BAD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)
labels = X_train.columns


# model training (error cause)
X_shared = theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train, 
               labels=labels,
               family=pm.glm.families.Binomial())


# Prediction on test data

X_shared = theano.shared(X_test)

ppc = pm.sample_ppc(pred_trace,
                    model=logistic_model_pred,
                    samples=100)


# AUC
np.mean(ppc['y'], axis=0).shape
y_score = np.mean(ppc['y'], axis=0)
roc_auc_score(y_score=np.mean(ppc['y'], axis=0), 
              y_true=y_test)

pred_scores = dict(y_true=y_test, y_score=y_score)
cols = ['False Positive Rate', 'True Positive Rate', 'threshold']
roc = pd.DataFrame(dict(zip(cols, roc_curve(**pred_scores))))

precision, recall, ts = precision_recall_curve(y_true=y_test, probas_pred=y_score)
pr_curve = pd.DataFrame({'Precision': precision, 'Recall': recall})

f1 = pd.Series({t: f1_score(y_true=y_test, y_pred=y_score>t) for t in ts})
best_threshold = f1.idxmax()




# Alternative solution

with pm.Model() as logistic_model_pred:
    pm.glm.GLM.from_formula('BAD ~ DELINQ + DEROG + DEBTINC + NINQ + 
CLNO + VALUE + MORTDUE + YOJ + LOAN + CLAGE + JOB',
                            data=pd.concat([y_train.reset_index(drop=True), X_train], axis=1),
                            family=pm.glm.families.Binomial())
    pred_trace = pm.sample(tune=1500,
                      draws=1000,
                         chains=4,
                         cores=1,
                      init='adapt_diag')
完整代码:

AsTensorError: ('Variable type field must be a TensorType.', <Generic>, <theano.gof.type.Generic object at 0x00000216ABB16730>)
%matplotlib inline
from pathlib import Path
import pickle
from collections import OrderedDict
import pandas as pd
import numpy as np
from scipy import stats
import multiprocessing
import arviz as az

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, 
                             precision_recall_curve, balanced_accuracy_score) 
from mlxtend.plotting import plot_confusion_matrix

import theano
import pymc3 as pm
from pymc3.variational.callbacks import CheckParametersConvergence
import statsmodels.formula.api as smf

import arviz as az
import matplotlib.pyplot as plt
import matplotlib.cm as cm

import seaborn as sns
from IPython.display import HTML

import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")


# intialise data of lists. 
data = {'BAD':[1,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1],
        'LOAN':[1700,1800,2300,2400,2400,2900,2900,2900,2900,
3000,3200,3300,3600,3600,3700,3800,3900],
        'MORTDUE':[30548,28502,102370,34863,98449,103949,104373,7750,61962,104570,
74864,130518,100693,52337,17857,51180,29896],
        'VALUE':[40320,43034,120953,47471,117195,112505,120702,67996,70915,121729,
87266,164317,114743,63989,21144,63459,45960],
        'REASON':['HomeImp','HomeImp','HomeImp','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp',
                  'DebtCon','HomeImp','HomeImp','DebtCon','HomeImp','HomeImp',
'HomeImp','HomeImp','HomeImp'],
        'JOB':['Other','Other','Office','Mgr','Office','Office','Office',
'Other','Mgr','Office','ProfExe',
               'Other','Office','Office','Other','Office','Other'],
        'YOJ':[9,11,2,12,4,1,2,16,2,2,7,9,6,20,5,20,11],
        'DEROG':[0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0],
        'DELINQ':[0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0],
        'CLAGE':[101.4660019,88.76602988,90.99253347,70.49108003,
93.81177486,96.10232967,101.5402975,
                 122.2046628,282.8016592,85.8843719,250.6312692,
192.289149,88.47045214,204.2724988,
                 129.7173231,203.7515336,146.1232422],
        'NINQ':[1,0,0,1,0,0,0,2,3,0,0,0,0,0,1,0,0],
        'CLNO':[8,8,13,21,13,13,13,8,37,14,12,33,14,20,9,20,14],
        'DEBTINC':[37.11361356,36.88489409,31.58850318,38.26360073,
29.68182705,30.05113629,29.91585903,
                   36.211348,49.20639579,32.05978327,42.90999735,
35.73055919,29.39354338,20.47091551,
                   26.63434752,20.06704205,24.47888119]
       } 
  
# Create DataFrame 
data = pd.DataFrame(data) 

# datatype defining
data[['BAD', 'LOAN', 'MORTDUE', 'VALUE', 'YOJ', 'DEROG', 'DELINQ', 
'CLAGE', 'NINQ', 'CLNO', 'DEBTINC']] = data[['BAD', 'LOAN', 'MORTDUE', 
'VALUE', 'YOJ', 'DEROG', 'DELINQ', 'CLAGE', 'NINQ', 'CLNO', 'DEBTINC' ]].apply(pd.to_numeric) 
data[['REASON', 'JOB']] = data[['REASON', 'JOB']].apply(lambda x: x.astype('category'))
print(data.dtypes) 
data.dropna(axis=0, how='any',inplace=True)    

# test train split
X = data.drop('BAD', axis=1)
y = data.BAD
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12345)
labels = X_train.columns


# model training (error cause)
X_shared = theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train, 
               labels=labels,
               family=pm.glm.families.Binomial())


# Prediction on test data

X_shared = theano.shared(X_test)

ppc = pm.sample_ppc(pred_trace,
                    model=logistic_model_pred,
                    samples=100)


# AUC
np.mean(ppc['y'], axis=0).shape
y_score = np.mean(ppc['y'], axis=0)
roc_auc_score(y_score=np.mean(ppc['y'], axis=0), 
              y_true=y_test)

pred_scores = dict(y_true=y_test, y_score=y_score)
cols = ['False Positive Rate', 'True Positive Rate', 'threshold']
roc = pd.DataFrame(dict(zip(cols, roc_curve(**pred_scores))))

precision, recall, ts = precision_recall_curve(y_true=y_test, probas_pred=y_score)
pr_curve = pd.DataFrame({'Precision': precision, 'Recall': recall})

f1 = pd.Series({t: f1_score(y_true=y_test, y_pred=y_score>t) for t in ts})
best_threshold = f1.idxmax()




# Alternative solution

with pm.Model() as logistic_model_pred:
    pm.glm.GLM.from_formula('BAD ~ DELINQ + DEROG + DEBTINC + NINQ + 
CLNO + VALUE + MORTDUE + YOJ + LOAN + CLAGE + JOB',
                            data=pd.concat([y_train.reset_index(drop=True), X_train], axis=1),
                            family=pm.glm.families.Binomial())
    pred_trace = pm.sample(tune=1500,
                      draws=1000,
                         chains=4,
                         cores=1,
                      init='adapt_diag')

如果将
#模型训练(错误原因)
#AUC
注释之间的代码替换为以下内容,您应该能够运行该代码并开始获得一些结果:

# model training (error cause)
X_train2 = X_train[X_train.columns[0:3]].values
scaler = preprocessing.StandardScaler()
scaler.fit(X_train2)
X_train2 = scaler.transform(X_train2)

X_shared = theano.shared(X_train2) #theano.shared(X_train)

with pm.Model() as logistic_model_pred:
    pm.glm.GLM(x=X_shared,
               y=y_train.values, 
               labels=labels[0:3],
               family=pm.glm.families.Binomial())
    trace = pm.sample()


# Prediction on test data
X_test2 = scaler.transform(X_test[X_train.columns[0:3]].values)

#X_shared = theano.shared(X_test2)
X_shared.set_value(X_test2)

ppc = pm.sample_ppc(trace,
                    model=logistic_model_pred,
                    samples=100)
# AUC
我做了以下更改:

  • 我已将进入
    theano.shared
    的变量更改为numpy数组
  • 这意味着
    X_train
    中的字符串列需要使用或类似的方法进行转换。我没有这样做,所以我只使用了前3列,碰巧是数字
  • 在运行pymc3之前,我还使用了重新缩放输入
  • 最后,对于后验预测,我使用
    .set\u value
    更改了共享变量的值

采样会产生很多差异,但我认为上面的内容为您提供了设置。

非常感谢您,我可以自己处理一个热编码!