r-calibration()函数如何计算观察到的事件百分比

r-calibration()函数如何计算观察到的事件百分比,r,random-forest,R,Random Forest,我正试图通过《应用预测建模》(max kuhn)一书中的示例进行研究。这是创建校准曲线的一个示例。 我有点理解这条曲线的意义,那就是看实际事件的比例是否与预测事件相似。但我很难理解输出的百分比列是如何计算的。 代码如下: library(AppliedPredictiveModeling) set.seed(975) simulatedTrain <- quadBoundaryFunc(500) simulatedTest <- quadBoundaryFunc(1000) #

我正在学习《应用预测建模》(Max Kuhn 著)一书中的示例。这是创建校准曲线的一个示例。
我有点理解这条曲线的意义,那就是看实际事件的比例是否与预测事件相似。但我很难理解输出的百分比列是如何计算的。
代码如下:

# Simulated two-class data: one training set and one test set, drawn after
# fixing the RNG seed so the run is reproducible.
library(AppliedPredictiveModeling)
set.seed(975)
simulatedTrain <- quadBoundaryFunc(500)
simulatedTest <- quadBoundaryFunc(1000)

# Random forest ----
library(randomForest)
rfModel <- randomForest(
  class ~ X1 + X2,
  data = simulatedTrain,
  ntree = 2000
)

# Probability predictions on the test set; keep only the "Class1" column.
rfTestPred <- predict(rfModel, newdata = simulatedTest, type = "prob")
simulatedTest[["RFprob"]] <- rfTestPred[, "Class1"]
# No `type` is given here, so the predict method's default output is stored.
simulatedTest[["RFclass"]] <- predict(rfModel, newdata = simulatedTest)

# Calibrating probabilities ----
library(caret)
calCurve <- calibration(x = class ~ RFprob, data = simulatedTest)
# Print the per-bin table that the calibration plot is drawn from.
calCurve$data




 calibModelVar            bin  Percent     Lower     Upper Count  midpoint
1         RFprob     [0,0.0909]  4.00000  2.203804  6.620306    14  4.545455
2         RFprob (0.0909,0.182] 20.00000 11.648215 30.832609    15 13.636364
3         RFprob  (0.182,0.273] 33.33333 20.395974 48.410832    16 22.727273
4         RFprob  (0.273,0.364] 37.20930 22.975170 53.274905    16 31.818182
5         RFprob  (0.364,0.455] 35.71429 18.640666 55.934969    10 40.909091
6         RFprob  (0.455,0.545] 53.19149 38.077789 67.888473    25 50.000000
7         RFprob  (0.545,0.636] 65.71429 47.789002 80.867590    23 59.090909
8         RFprob  (0.636,0.727] 72.50000 56.111709 85.399101    29 68.181818
9         RFprob  (0.727,0.818] 83.33333 67.188407 93.627987    30 77.272727
10        RFprob  (0.818,0.909] 95.83333 85.745903 99.491353    46 86.363636
11        RFprob      (0.909,1] 94.00000 90.296922 96.603304   235 95.454545
当我绘制图表时

# Draw the calibration curve; the legend key is laid out in two columns.
xyplot(calCurve, auto.key = list(columns = 2))
在 X 轴上,我知道它是各分箱的中点,即 `midpoint` 列;y 轴是 `Percent` 列。但是,`Percent` 列是如何计算的?


在 `calibration` 中,`Percent` 列的计算方式如下。
首先,将预测的概率划分为 11 个等宽区间:

# Split the predicted Class1 probabilities into 11 equal-width bins on [0, 1]
# (12 break points); include.lowest = TRUE closes the first bin at 0.
simulatedTest$bin <- cut(simulatedTest$RFprob,
                         breaks = seq(0, 1, length.out = 12),
                         include.lowest = TRUE)
# Cross-tabulate bin (rows) against observed class (columns). The later
# prop.table() and binom.test() steps operate on this `tbl`.
tbl <- table(simulatedTest$bin, simulatedTest$class)
# Number of test samples falling in each bin.
table(simulatedTest$bin)

    [0,0.0909] (0.0909,0.182]  (0.182,0.273]  (0.273,0.364]  (0.364,0.455] 
           350             75             48             43             28 
 (0.455,0.545]  (0.545,0.636]  (0.636,0.727]  (0.727,0.818]  (0.818,0.909] 
            47             35             40             36             48 
     (0.909,1] 
           250
`Percent` 列包含 `tbl`(分箱 × 实际类别的列联表)的行比例,其中 `Class1` 一列即为 `Percent`:

# Row-wise percentages of `tbl`: each row is scaled to sum to 100.
# NOTE(review): the output pasted below shows 6 decimals, which digits = 2
# would not print - confirm which of the two is the intended one.
round(100 * prop.table(tbl, margin = 1), digits = 2)

                    Class1    Class2
  [0,0.0909]      4.000000 96.000000
  (0.0909,0.182] 20.000000 80.000000
  (0.182,0.273]  33.333333 66.666667
  (0.273,0.364]  37.209302 62.790698
  (0.364,0.455]  35.714286 64.285714
  (0.455,0.545]  53.191489 46.808511
  (0.545,0.636]  65.714286 34.285714
  (0.636,0.727]  72.500000 27.500000
  (0.727,0.818]  83.333333 16.666667
  (0.818,0.909]  95.833333  4.166667
  (0.909,1]      94.000000  6.000000
`calibration` 使用 `binom.test` 计算这些比例的置信区间:

# For every row of `tbl`, run an exact binomial test on the first column's
# count out of the row total, and report estimate + confidence bounds in
# percent. apply() returns one column per row of `tbl`, hence the transpose.
t(apply(tbl, MARGIN = 1, FUN = function(counts) {
  fit <- binom.test(x = counts[1], n = sum(counts))
  round(100 * c(fit$estimate, fit$conf.int), digits = 6)
}))

                 probability of success                    
  [0,0.0909]                    4.00000  2.203804  6.620306
  (0.0909,0.182]               20.00000 11.648215 30.832609
  (0.182,0.273]                33.33333 20.395974 48.410832
  (0.273,0.364]                37.20930 22.975170 53.274905
  (0.364,0.455]                35.71429 18.640666 55.934969
  (0.455,0.545]                53.19149 38.077789 67.888473
  (0.545,0.636]                65.71429 47.789002 80.867590
  (0.636,0.727]                72.50000 56.111709 85.399101
  (0.727,0.818]                83.33333 67.188407 93.627987
  (0.818,0.909]                95.83333 85.745903 99.491353
  (0.909,1]                    94.00000 90.296922 96.603304
t(应用(tbl,1,函数(x){
宾斯特
# Row-wise percentages of `tbl`: each row is scaled to sum to 100.
# NOTE(review): the output pasted below shows 6 decimals, which digits = 2
# would not print - confirm which of the two is the intended one.
round(100 * prop.table(tbl, margin = 1), digits = 2)

                    Class1    Class2
  [0,0.0909]      4.000000 96.000000
  (0.0909,0.182] 20.000000 80.000000
  (0.182,0.273]  33.333333 66.666667
  (0.273,0.364]  37.209302 62.790698
  (0.364,0.455]  35.714286 64.285714
  (0.455,0.545]  53.191489 46.808511
  (0.545,0.636]  65.714286 34.285714
  (0.636,0.727]  72.500000 27.500000
  (0.727,0.818]  83.333333 16.666667
  (0.818,0.909]  95.833333  4.166667
  (0.909,1]      94.000000  6.000000
# For every row of `tbl`, run an exact binomial test on the first column's
# count out of the row total, and report estimate + confidence bounds in
# percent. apply() returns one column per row of `tbl`, hence the transpose.
t(apply(tbl, MARGIN = 1, FUN = function(counts) {
  fit <- binom.test(x = counts[1], n = sum(counts))
  round(100 * c(fit$estimate, fit$conf.int), digits = 6)
}))

                 probability of success                    
  [0,0.0909]                    4.00000  2.203804  6.620306
  (0.0909,0.182]               20.00000 11.648215 30.832609
  (0.182,0.273]                33.33333 20.395974 48.410832
  (0.273,0.364]                37.20930 22.975170 53.274905
  (0.364,0.455]                35.71429 18.640666 55.934969
  (0.455,0.545]                53.19149 38.077789 67.888473
  (0.545,0.636]                65.71429 47.789002 80.867590
  (0.636,0.727]                72.50000 56.111709 85.399101
  (0.727,0.818]                83.33333 67.188407 93.627987
  (0.818,0.909]                95.83333 85.745903 99.491353
  (0.909,1]                    94.00000 90.296922 96.603304