R 如何加快数据模型的训练/评估?

R 如何加快数据模型培训/评估?,r,performance,ram,R,Performance,Ram,我使用caret::train从一个包含32000 obs和250个变量的数据集中训练了11个模型。我在Linux机器上使用R 软件速度大大减慢了。我将所有型号存储在一个列表中,并写入磁盘,占用了大约1.3Gb的空间 有没有人有使用R创建/使用RAM磁盘的经验?RAM磁盘会对性能产生重大影响吗 我猜我达到了交换内存的限制。关于如何加快R的速度还有其他建议吗 我想完成预测/性能命令,同时绘制测试ROC,并将11个模型的ROC性能制成表格。此外,我还想为其他几个因变量运行该过程 我相信有80GB的r

我使用caret::train在一个包含32000个观测值和250个变量的数据集上训练了11个模型。我在Linux机器上使用R

程序的运行速度大大减慢了。我将所有模型存储在一个列表中,并写入磁盘,占用了大约1.3GB的空间

有没有人有使用R创建/使用RAM磁盘的经验?RAM磁盘会对性能产生重大影响吗

我猜我达到了交换内存的限制。关于如何加快R的速度还有其他建议吗

我想完成预测/性能命令,同时绘制测试ROC,并将11个模型的ROC性能制成表格。此外,我还想为其他几个因变量运行该过程

我相信有80GB的ram,有7个内核。我在训练模型时使用了6核并行处理,但是森林模型需要几个小时的训练

该代码与下文的伪代码类似,只是实际代码在训练前用cor去除了相关性高于0.9的自变量。我没有可以共享的数据集,但数据大约有30000个观测值和250个自变量,还有大约20个因变量,不过我一次只处理一个因变量

伪代码。主要区别在于,实际代码会在每个模型计算完成后将其保存到磁盘,并先检查磁盘上是否已有之前计算好的模型。此外,我还启用了多核并行,这在本代码示例中没有列出。训练使用6个核心,耗时约30小时。我等了大约3小时,预测/性能命令仍未完成,然后就回家了

#' Evaluate one fitted caret model on the held-out test set.
#'
#' Predicts on the test rows only and derives the confusion-matrix counts,
#' summary statistics and ROC data that `caretApproach()` tabulates and plots.
#'
#' @param fit A fitted model object as returned by `train()`.
#' @param testDescr Data frame of test-set predictors.
#' @param testClass Observed test-set classes.
#' @param probCol Name of the class-probability column used as the ROC score.
#' @return A list with `stats` (numeric vector: TP, FN, FP, TN, AUC, accuracy,
#'   sensitivity, specificity, phi) and `roc` (the ROCR "tpr"/"fpr"
#'   performance object).
caretApproachEvaluate <- function(fit, testDescr, testClass, probCol) {
  pred <- predict(fit, newdata = testDescr)
  # confusionMatrix() needs two factors with the same levels.
  if (!is.factor(testClass)) {
    testClass <- factor(testClass, levels = levels(pred))
  }

  # Rows are predictions, columns are observations; the first level is
  # treated as the positive class, exactly as in the original indexing.
  cm <- confusionMatrix(pred, testClass)$table
  # as.numeric() avoids integer overflow in tp * tn on large test sets.
  tp <- as.numeric(cm[1, 1])
  fn <- as.numeric(cm[2, 1])
  fp <- as.numeric(cm[1, 2])
  tn <- as.numeric(cm[2, 2])

  acc <- (tp + tn) / (tp + fn + fp + tn)
  sens <- tp / (tp + fn)
  spec <- tn / (tn + fp)
  phi <- (tp * tn - fp * fn) /
    (sqrt(tp + fn) * sqrt(tn + fp) * sqrt(tp + fp) * sqrt(tn + fn))

  probs <- predict(fit, newdata = testDescr, type = "prob")
  if (!probCol %in% names(probs)) {
    stop("class-probability column not found: ", probCol, call. = FALSE)
  }
  rocrObject <- prediction(probs[[probCol]], testClass)
  # @y.values is a list; take its single element so the stats stay numeric.
  modelAUC <- performance(rocrObject, "auc")@y.values[[1]]

  list(
    stats = c(tp, fn, fp, tn, modelAUC, acc, sens, spec, phi),
    roc = performance(rocrObject, "tpr", "fpr")
  )
}

#' Train 11 caret classifiers and compare them on a held-out test set.
#'
#' Splits `df` 80/20 (stratified on the outcome column `cName`), trains one
#' model per method, saves the list of fitted models to 'models.RData', draws
#' all test-set ROC curves into one PNG and writes a table of statistics.
#'
#' @param df Data frame holding the predictors and the outcome column.
#' @param cName Name (single string) of the outcome column in `df`.
#' @param l Passed to `train()` as `tuneLength`.
#' @param m Passed to `trainControl()` as the resampling `method`.
#' @param n Not used by the code shown here; kept for interface compatibility.
#' @param ostats Path of the text file the statistics table is written to.
#' @param oplot Path of the PNG file the ROC curves are drawn into.
#' @param pw,ph Width and height of the PNG in pixels.
#' @param plty Line type used for every ROC curve.
#' @param secAllowed Not used by the code shown here; kept for compatibility.
#' @param useParellel Not used by the code shown here; kept for compatibility.
#' @param probCol Name of the class-probability column used as the ROC score
#'   (defaults to "Control", the value that was previously hard-coded).
#' @return An 11 x 9 numeric matrix, one row per model, with columns TP, FN,
#'   FP, TN, AUC, Accuracy, Sensitivity, Specificity and phi.
caretApproach <- function(df, cName, l=3, m="cv", n=3/4,
                          ostats="gripComp.txt", oplot="gripComp.png",
                          pw=700, ph=700, plty=1, secAllowed=5,
                          useParellel=TRUE, probCol="Control"){

  if (!is.data.frame(df) || !is.character(cName) || length(cName) != 1 ||
      !cName %in% names(df)) {
    stop("`cName` must name a column of data frame `df`", call. = FALSE)
  }

  # Seed BEFORE partitioning so the train/test split is reproducible, and
  # stratify on the requested outcome column rather than a fixed one.
  set.seed(1)
  inTrain <- createDataPartition(df[[cName]], p = .8,
                                 list = FALSE,
                                 times = 1)

  indep <- df[, setdiff(names(df), cName), drop = FALSE]
  classes <- df[[cName]]

  # NOTE(review): the original author removed predictors correlated above 0.9
  # at this point, but that fragment was not part of the posted code, so no
  # filtering is done here.

  trainDescr <- indep[inTrain, , drop = FALSE]
  testDescr <- indep[-inTrain, , drop = FALSE]
  trainClass <- classes[inTrain]
  testClass <- classes[-inTrain]

  # caret method names; they double as the names of the model list, the
  # legend entries and the row names of the result.
  # NOTE(review): the author mentions an extra factor check before fitting
  # 'multinom' that is not part of the posted code.
  modelNames <- c("rpart", "ctree2", "rf", "cforest", "treebag", "nnet",
                  "svmRadial", "svmLinear", "knn", "glm", "multinom")

  set.seed(2)
  models <- lapply(modelNames, function(modelName) {
    train(trainDescr, trainClass, method = modelName, tuneLength = l,
          trControl = trainControl(method = m))
  })
  names(models) <- modelNames

  # Reported by the author: ~30 h to reach this point, ~1.3 GB on disk.
  save(models, file='models.RData')

  # Predict on the test set only. extractPrediction()/extractProb() also
  # re-predict the training rows for every model, and that output was
  # discarded by the dataType == "Test" filter.
  evals <- lapply(models, caretApproachEvaluate,
                  testDescr = testDescr, testClass = testClass,
                  probCol = probCol)

  plotColors <- colors()[c(9,17,19,24,27,33,51,62,84,254,552)]
  png(filename = oplot, width = pw, height = ph, units = "px")
  # Close the device even if plotting fails, so it is never leaked.
  tryCatch({
    for (i in seq_along(evals)) {
      plot(evals[[i]]$roc, col = plotColors[i], main = "", lty = plty,
           add = i > 1)
      if (i == 1) {
        legend('bottomright', modelNames, pch = c(15), col = plotColors)
      }
    }
  }, finally = dev.off())

  ### output
  statNames <- c("TP", "FN", "FP", "TN", "AUC", "Accuracy", "Sensitivity",
                 "Specificity", "phi")
  s <- matrix(unlist(lapply(evals, `[[`, "stats")),
              nrow = length(modelNames), ncol = length(statNames),
              byrow = TRUE, dimnames = list(modelNames, statNames))
  write.table(s, ostats)
  write.table(s)
  s
}

caretApproach RAM磁盘没有帮助。如果您有足够的RAM,您可以将所有这些模型保存在R中的列表中。如果不这样做,那么将其保存到RAM磁盘将不起作用。如果您想保留模型以备以后使用,则应将其保存到真实磁盘。我们可能会帮助您加快速度,但我们需要有关您正在编写的代码的更多信息。我需要确认可用的RAM,但我切换到特定unix设备的原因是因为它有80 GB的RAM。我在4-5 GB的pc上遇到内存错误。可能是predict/performance命令需要很长时间才能运行,而我只是没有等待足够长的时间。但是我想在返回时计算几个因变量,所以我跟踪模型时间(伪代码中没有显示)。我只有几天的时间来完成这项工作。