R 了解xgboost交叉验证和AUC输出结果_R_Cross Validation_Xgboost

R 了解xgboost交叉验证和AUC输出结果

R 了解xgboost交叉验证和AUC输出结果,r,cross-validation,xgboost,R,Cross Validation,Xgboost,我有以下XGBoost交叉验证（C.V.）模型 xgboostModelCV <- xgb.cv(data = dtrain, nrounds = 20, nfold = 3, metrics = "auc", verbose = TRUE,

我有以下XGBoost交叉验证（C.V.）模型：

# 3-fold cross-validation of a binary-logistic gradient-boosted tree model,
# scored with AUC for at most 20 boosting rounds (stopping early when the test
# AUC has not improved for 10 rounds).
#
# xgb.cv() draws the fold assignment with R's random number generator, so the
# R seed must be set for the split (and hence the reported AUC) to be
# reproducible; the `seed` booster parameter alone does not control it.
set.seed(1234)

xgboostModelCV <- xgb.cv(
  # Booster parameters go in `params` rather than through `...`.
  params = list(
    objective = "binary:logistic",
    booster = "gbtree",
    max_depth = 6,
    eta = 0.01,
    subsample = 0.5,
    colsample_bytree = 1,
    min_child_weight = 1,
    seed = 1234
  ),
  data = dtrain,
  nrounds = 20,
  nfold = 3,
  # The metric is given once; the original passed it both as `metrics` and as
  # "eval_metric".
  metrics = "auc",
  verbose = TRUE,
  print_every_n = 1,
  early_stopping_rounds = 10
  # `watchlist` is an xgb.train() argument; xgb.cv() evaluates on the
  # train/test part of each fold itself, so it is not passed here.
)

但是，我设置了

nrounds=20

但交叉验证

nfold

=3，那么我是否应该得到60个结果，而不是只有20个？

还是说，上述输出正如列名所示，是每一轮各折AUC的平均分数？

因此，对于训练集，在

nround=1

时，

train_auc_mean

是结果

0.8852290

，这应该是3个交叉验证折（

nfold

）的平均值？

所以如果我绘制这些AUC分数，那么我绘制的就是3折交叉验证的平均AUC分数。

只是想确保一切都清楚。

输出的是各折

auc

的平均值，这一点您理解得没错。但是，如果您希望提取最佳/最后一次迭代中每一折各自的auc，可以按照以下步骤进行：

下面以 mlbench 包中的 Sonar 数据集为例：

library(xgboost)
library(tidyverse)
library(mlbench)
library(pROC)

# Load the Sonar example data set into the workspace.
data(Sonar)

# Training matrix: columns 1-60 are the features; the label is the integer
# code of `Class` shifted to start at 0 (presumably a two-level factor, giving
# 0/1 labels - confirm against the data set documentation).
xgb.train.data <- xgb.DMatrix(
  data = as.matrix(Sonar[, 1:60]),
  label = as.numeric(Sonar$Class) - 1
)

# Booster parameters shared by the cross-validation call.
param <- list(objective = "binary:logistic")

现在遍历各折，并将每一折的预测值与真实标签及相应的行索引合并在一起：

# For every CV fold, collect the out-of-fold prediction, the true 0/1 label and
# the row index of each observation in that fold.
# NOTE: `model.cv` is the result of the xgb.cv(..., prediction = TRUE) call
# shown further below; it must exist before this snippet is run.
z <- lapply(model.cv$folds, function(fold_rows) {
  data.frame(
    pred = model.cv$pred[fold_rows],
    true = (as.numeric(Sonar$Class) - 1)[fold_rows],
    index = fold_rows
  )
})

当然，您还可以做更多的事情，比如为每一折绘制ROC曲线等等。

# 3-fold cross-validation with AUC as the evaluation metric, for at most 50
# rounds with early stopping after 10 rounds without improvement.
# prediction = TRUE keeps the out-of-fold predictions (read later as
# model.cv$pred), which is what makes per-fold AUCs computable afterwards.
# The fold split is random; call set.seed() beforehand for reproducible folds.
model.cv <- xgb.cv(
  # Spelled out in full: `param = param` only worked through partial matching
  # of the `params` argument.
  params = param,
  data = xgb.train.data,
  nrounds = 50,
  early_stopping_rounds = 10,
  nfold = 3,
  prediction = TRUE,
  eval_metric = "auc"
)

# For every CV fold, collect the out-of-fold prediction, the true 0/1 label and
# the row index of each observation in that fold.
z <- lapply(model.cv$folds, function(fold_rows) {
  data.frame(
    pred = model.cv$pred[fold_rows],
    true = (as.numeric(Sonar$Class) - 1)[fold_rows],
    index = fold_rows
  )
})

# Label the folds "folds_1", "folds_2", ... The labels follow the actual number
# of folds instead of a hard-coded 1:3, so changing `nfold` cannot cause a
# length mismatch.
names(z) <- paste("folds", seq_along(z), sep = "_")

# AUC of the out-of-fold predictions, one row per fold.
# roc()/auc() are the pROC functions. `levels` and `direction` are fixed
# explicitly (0 = control, 1 = case, higher prediction = case) so that every
# fold is scored the same way rather than auto-detected per fold; as.numeric()
# turns the "auc" object into a plain number for the summary column.
z %>%
  bind_rows(.id = "id") %>%
  group_by(id) %>%
  summarise(
    auroc = as.numeric(auc(roc(true, pred, levels = c(0, 1), direction = "<")))
  )
#output
# A tibble: 3 x 2
  id      auroc
  <chr>   <dbl>
1 folds_1 0.944
2 folds_2 0.900
3 folds_3 0.899

# Mean of the per-fold AUCs, for comparison with test_auc_mean in
# model.cv$evaluation_log.
# roc()/auc() are the pROC functions. `levels` and `direction` are fixed
# explicitly (0 = control, 1 = case, higher prediction = case) so that every
# fold is scored the same way rather than auto-detected per fold; as.numeric()
# turns the "auc" object into a plain number.
z %>%
  bind_rows(.id = "id") %>%
  group_by(id) %>%
  summarise(
    auroc = as.numeric(auc(roc(true, pred, levels = c(0, 1), direction = "<")))
  ) %>%
  pull(auroc) %>%
  mean()
#output
[1] 0.9143798

# Row of the evaluation log at the best iteration (mean/std of the train and
# test AUC over the folds). `[[` is used for exact, non-partial name lookup.
model.cv[["evaluation_log"]][model.cv[["best_iteration"]], ]
#output
   iter train_auc_mean train_auc_std test_auc_mean test_auc_std
1:   48              1             0       0.91438   0.02092817