Machine learning 如何为两个分类器和两个不同的数据集创建组合ROC曲线_Machine Learning_Roc

Machine learning 如何为两个分类器和两个不同的数据集创建组合ROC曲线

machine-learning

Machine learning 如何为两个分类器和两个不同的数据集创建组合ROC曲线,machine-learning,roc,Machine Learning,Roc,我有一个1127名患者的数据集。我的目标是将每个患者分类为0或1。我有两个不同的分类器，但目的相同——将患者分类为0或1。我对364名患者运行了一个分类器，对763名患者运行了第二个分类器。对于每个分类器\组，我生成ROC曲线。现在，我想合并这些曲线。有人能指导我怎么做吗？我正在考虑计算加权FPR和TPR，但我不确定如何计算。曲线之间的FPR\TPR对数不同（第一条ROC曲线基于312对，第二条ROC曲线基于666对）谢谢导入 import numpy as np from s

我有一个包含1127名患者的数据集。我的目标是将每个患者分类为0或1。我有两个不同的分类器，但目的相同——将患者分类为0或1。我对364名患者运行了第一个分类器，对763名患者运行了第二个分类器。对于每个分类器/患者组，我都生成了一条ROC曲线。现在，我想合并这两条曲线。有人能指导我怎么做吗？我正在考虑计算加权的FPR和TPR，但我不确定如何计算。两条曲线的(FPR, TPR)点对数量不同（第一条ROC曲线基于312对，第二条ROC曲线基于666对）。

谢谢

导入

import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def show_ROCs(scores_list: list, ys_list: list, labels_list:list = None):
    """
    Plot one ROC curve per (scores, true labels) pair and show the figure.

    Each curve is drawn through ``plot_ROC`` and its legend entry carries the
    area under the curve rounded to 3 decimals.

    Parameters
    ----------
    scores_list : list of array-likes with scorings or predicted probabilities.
    ys_list : list of array-likes with ground true labels; entry ``i`` is
        paired with ``scores_list[i]``.
    labels_list : optional list of labels to be displayed in plotted graph.
        When it is ``None``, or has no entry for a curve, the curve's
        positional index is used in the legend instead.

    Returns
    ----------
    None

    Raises
    ----------
    ValueError
        If ``scores_list`` and ``ys_list`` differ in length.

    """
    if len(scores_list) != len(ys_list):
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('len(scores_list) != len(ys_list)')

    # Compute every curve before drawing anything, so bad input fails before
    # the figure is touched.
    curves = [roc_curve(ys, scores) for ys, scores in zip(ys_list, scores_list)]

    for idx, (fpr, tpr, _) in enumerate(curves):
        auc_text = str(round(auc(fpr, tpr), 3))

        # Only the label lookup may fall back; errors raised by auc() or
        # plot_ROC() must propagate instead of being silently retried.
        legend = None
        if labels_list is not None:
            try:
                legend = str(labels_list[idx]) + ' AUC:' + auc_text
            except (IndexError, KeyError):
                legend = None
        if legend is None:
            # Fallback legend format kept exactly as it was: "<index> <auc>".
            legend = str(idx) + ' ' + auc_text

        plot_ROC(fpr, tpr, legend)
    plt.show()

def plot_ROC(fpr, tpr, label):
    """
    Draw a single ROC curve on pyplot figure 1.

    A dashed black diagonal from (0, 0) to (1, 1) is drawn first, then the
    curve itself. Axis labels, title and legend are (re)applied on every
    call. The figure is not shown here; the caller decides when to call
    ``plt.show()``.

    Parameters
    ----------
    fpr : array-like with false positive rates (x axis).
    tpr : array-like with true positive rates (y axis).
    label : legend entry for this curve (required).

    Returns
    ----------
    None

    """
    # Always target figure number 1 so repeated calls draw on the same figure.
    plt.figure(1)

    # Diagonal reference line: same endpoints on both axes.
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')

    # The actual ROC curve, labelled for the legend.
    plt.plot(fpr, tpr, label=label)

    plt.title('ROC curve')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')

    # Must come after the labelled plot so the new entry is picked up.
    plt.legend(loc='best')

数据生成

# `pd` is used below but is not imported in any of the visible snippets;
# import it here so this snippet does not fail with NameError.
# (`np` comes from the import snippet above.)
import pandas as pd

# simulate first dataset with 364 obs
# Start from an empty frame with a 0..363 index instead of building a
# throwaway column that has to be dropped again afterwards.
df1 = pd.DataFrame(index=range(364))
# Classifier score: standard-normal draws (unseeded, so values differ per run).
df1['predict_proba_1'] = np.random.normal(0, 1, len(df1))
# Independent standard-normal noise, used only to derive the label.
df1['epsilon'] = np.random.normal(0, 1, len(df1))
# Label is 1 where 0.7 * noise is below the score; `* 1` turns bool into int.
df1['true'] = (0.7 * df1['epsilon'] < df1['predict_proba_1']) * 1
df1 = df1.drop(columns=['epsilon'])

# simulate second dataset with 763 obs
df2 = pd.DataFrame(index=range(763))
df2['predict_proba_2'] = np.random.normal(0, 1, len(df2))
df2['epsilon'] = np.random.normal(0, 1, len(df2))
df2['true'] = (0.7 * df2['epsilon'] < df2['predict_proba_2']) * 1
df2 = df2.drop(columns=['epsilon'])

必要的函数

import numpy as np
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def show_ROCs(scores_list: list, ys_list: list, labels_list:list = None):
    """
    Plot one ROC curve per (scores, true labels) pair and show the figure.

    Each curve is drawn through ``plot_ROC`` and its legend entry carries the
    area under the curve rounded to 3 decimals.

    Parameters
    ----------
    scores_list : list of array-likes with scorings or predicted probabilities.
    ys_list : list of array-likes with ground true labels; entry ``i`` is
        paired with ``scores_list[i]``.
    labels_list : optional list of labels to be displayed in plotted graph.
        When it is ``None``, or has no entry for a curve, the curve's
        positional index is used in the legend instead.

    Returns
    ----------
    None

    Raises
    ----------
    ValueError
        If ``scores_list`` and ``ys_list`` differ in length.

    """
    if len(scores_list) != len(ys_list):
        # ValueError is still an Exception, so existing handlers keep working.
        raise ValueError('len(scores_list) != len(ys_list)')

    # Compute every curve before drawing anything, so bad input fails before
    # the figure is touched.
    curves = [roc_curve(ys, scores) for ys, scores in zip(ys_list, scores_list)]

    for idx, (fpr, tpr, _) in enumerate(curves):
        auc_text = str(round(auc(fpr, tpr), 3))

        # Only the label lookup may fall back; errors raised by auc() or
        # plot_ROC() must propagate instead of being silently retried.
        legend = None
        if labels_list is not None:
            try:
                legend = str(labels_list[idx]) + ' AUC:' + auc_text
            except (IndexError, KeyError):
                legend = None
        if legend is None:
            # Fallback legend format kept exactly as it was: "<index> <auc>".
            legend = str(idx) + ' ' + auc_text

        plot_ROC(fpr, tpr, legend)
    plt.show()

def plot_ROC(fpr, tpr, label):
    """
    Draw a single ROC curve on pyplot figure 1.

    A dashed black diagonal from (0, 0) to (1, 1) is drawn first, then the
    curve itself. Axis labels, title and legend are (re)applied on every
    call. The figure is not shown here; the caller decides when to call
    ``plt.show()``.

    Parameters
    ----------
    fpr : array-like with false positive rates (x axis).
    tpr : array-like with true positive rates (y axis).
    label : legend entry for this curve (required).

    Returns
    ----------
    None

    """
    # Always target figure number 1 so repeated calls draw on the same figure.
    plt.figure(1)

    # Diagonal reference line: same endpoints on both axes.
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')

    # The actual ROC curve, labelled for the legend.
    plt.plot(fpr, tpr, label=label)

    plt.title('ROC curve')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')

    # Must come after the labelled plot so the new entry is picked up.
    plt.legend(loc='best')

绘图

# Overlay the ROC curves of both simulated datasets on one figure; each
# legend entry names the dataset and its number of observations.
show_ROCs(
    scores_list=[df1['predict_proba_1'], df2['predict_proba_2']],
    ys_list=[df1['true'], df2['true']],
    labels_list=[
        'df1 with {} obs'.format(len(df1)),
        'df2 with {} obs'.format(len(df2)),
    ],
)