randomForest、randomForestSRC或cforest中单个树的可变重要性?
我试图在R中找到一种方法来计算随机林或条件随机林中单个树的变量重要性randomForest、randomForestSRC或cforest中单个树的可变重要性?,r,tree,random-forest,party,ensemble-learning,R,Tree,Random Forest,Party,Ensemble Learning,我试图在R中找到一种方法来计算随机林或条件随机林中单个树的变量重要性 一个好的起点是rpart:::importance命令,它计算rpart树的变量重要性度量: > library(rpart) > rp <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis) > rpart:::importance(rp) Start Age Number 8.198442 3.101801 1.5
一个好的起点是
rpart:::importance
命令,它计算rpart
树的变量重要性度量:
> library(rpart)
> rp <- rpart(Kyphosis ~ Age + Number + Start, data = kyphosis)
> rpart:::importance(rp)
Start Age Number
8.198442 3.101801 1.521863
解决方案是使用as.rpart
命令将tree1
强制到rpart
对象。不幸的是,我不知道任何R包中都有此命令。使用
party
包,我发现了类似的问题。varimp
命令可用于cforest
对象,而不是单个树
library(party)
cf <- cforest(Kyphosis ~ Age + Number + Start, data = kyphosis)
ct <- party:::prettytree(cf@ensemble[[1]], names(cf@data@get("input")))
tree2 <- new("BinaryTree")
tree2@tree <- ct
tree2@data <- cf@data
tree2@responses <- cf@responses
tree2@weights <- cf@initweights
varimp(tree2)
Error in varimp(tree2) :
no slot of name "initweights" for this object of class "BinaryTree"
library(party)
cf根据@Alex的建议,我参加了聚会:::varimp
。此命令计算cforest
的标准(平均降低精度)和条件变量重要性(VI),可以轻松修改以计算森林中每棵树的VI
set.seed(12345)
y <- cforest(score ~ ., data = readingSkills,
control = cforest_unbiased(mtry = 2, ntree = 10))
varimp_ctrees <- function (object, mincriterion = 0, conditional = FALSE,
threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional) {
response <- object@responses
if (length(response@variables) == 1 && inherits(response@variables[[1]],
"Surv"))
return(varimpsurv(object, mincriterion, conditional,
threshold, nperm, OOB, pre1.0_0))
input <- object@data@get("input")
xnames <- colnames(input)
inp <- initVariableFrame(input, trafo = NULL)
y <- object@responses@variables[[1]]
if (length(response@variables) != 1)
stop("cannot compute variable importance measure for multivariate response")
if (conditional || pre1.0_0) {
if (!all(complete.cases(inp@variables)))
stop("cannot compute variable importance measure with missing values")
}
CLASS <- all(response@is_nominal)
ORDERED <- all(response@is_ordinal)
if (CLASS) {
error <- function(x, oob) mean((levels(y)[sapply(x, which.max)] !=
y)[oob])
} else {
if (ORDERED) {
error <- function(x, oob) mean((sapply(x, which.max) !=
y)[oob])
} else {
error <- function(x, oob) mean((unlist(x) - y)[oob]^2)
}
}
w <- object@initweights
if (max(abs(w - 1)) > sqrt(.Machine$double.eps))
warning(sQuote("varimp"), " with non-unity weights might give misleading results")
perror <- matrix(0, nrow = nperm * length(object@ensemble),
ncol = length(xnames))
colnames(perror) <- xnames
for (b in 1:length(object@ensemble)) {
tree <- object@ensemble[[b]]
if (OOB) {
oob <- object@weights[[b]] == 0
} else {
oob <- rep(TRUE, length(y))
}
p <- .Call("R_predict", tree, inp, mincriterion, -1L,
PACKAGE = "party")
eoob <- error(p, oob)
for (j in unique(party:::varIDs(tree))) {
for (per in 1:nperm) {
if (conditional || pre1.0_0) {
tmp <- inp
ccl <- create_cond_list(conditional, threshold,
xnames[j], input)
if (is.null(ccl)) {
perm <- sample(which(oob))
} else {
perm <- conditional_perm(ccl, xnames, input,
tree, oob)
}
tmp@variables[[j]][which(oob)] <- tmp@variables[[j]][perm]
p <- .Call("R_predict", tree, tmp, mincriterion,
-1L, PACKAGE = "party")
} else {
p <- .Call("R_predict", tree, inp, mincriterion,
as.integer(j), PACKAGE = "party")
}
perror[(per + (b - 1) * nperm), j] <- (error(p,
oob) - eoob)
}
}
}
perror <- as.data.frame(perror)
return(list(MeanDecreaseAccuracy = colMeans(perror), VIMcTrees=perror))
}
我认为您必须手动编写函数。注意tree1
可能属于data.frame类,但它不是典型的数据帧。它为您提供树的分类规则rpart:::重要性
和random森林:::重要性
的代码都很容易访问
set.seed(12345)
y <- cforest(score ~ ., data = readingSkills,
control = cforest_unbiased(mtry = 2, ntree = 10))
varimp_ctrees <- function (object, mincriterion = 0, conditional = FALSE,
threshold = 0.2, nperm = 1, OOB = TRUE, pre1.0_0 = conditional) {
response <- object@responses
if (length(response@variables) == 1 && inherits(response@variables[[1]],
"Surv"))
return(varimpsurv(object, mincriterion, conditional,
threshold, nperm, OOB, pre1.0_0))
input <- object@data@get("input")
xnames <- colnames(input)
inp <- initVariableFrame(input, trafo = NULL)
y <- object@responses@variables[[1]]
if (length(response@variables) != 1)
stop("cannot compute variable importance measure for multivariate response")
if (conditional || pre1.0_0) {
if (!all(complete.cases(inp@variables)))
stop("cannot compute variable importance measure with missing values")
}
CLASS <- all(response@is_nominal)
ORDERED <- all(response@is_ordinal)
if (CLASS) {
error <- function(x, oob) mean((levels(y)[sapply(x, which.max)] !=
y)[oob])
} else {
if (ORDERED) {
error <- function(x, oob) mean((sapply(x, which.max) !=
y)[oob])
} else {
error <- function(x, oob) mean((unlist(x) - y)[oob]^2)
}
}
w <- object@initweights
if (max(abs(w - 1)) > sqrt(.Machine$double.eps))
warning(sQuote("varimp"), " with non-unity weights might give misleading results")
perror <- matrix(0, nrow = nperm * length(object@ensemble),
ncol = length(xnames))
colnames(perror) <- xnames
for (b in 1:length(object@ensemble)) {
tree <- object@ensemble[[b]]
if (OOB) {
oob <- object@weights[[b]] == 0
} else {
oob <- rep(TRUE, length(y))
}
p <- .Call("R_predict", tree, inp, mincriterion, -1L,
PACKAGE = "party")
eoob <- error(p, oob)
for (j in unique(party:::varIDs(tree))) {
for (per in 1:nperm) {
if (conditional || pre1.0_0) {
tmp <- inp
ccl <- create_cond_list(conditional, threshold,
xnames[j], input)
if (is.null(ccl)) {
perm <- sample(which(oob))
} else {
perm <- conditional_perm(ccl, xnames, input,
tree, oob)
}
tmp@variables[[j]][which(oob)] <- tmp@variables[[j]][perm]
p <- .Call("R_predict", tree, tmp, mincriterion,
-1L, PACKAGE = "party")
} else {
p <- .Call("R_predict", tree, inp, mincriterion,
as.integer(j), PACKAGE = "party")
}
perror[(per + (b - 1) * nperm), j] <- (error(p,
oob) - eoob)
}
}
}
perror <- as.data.frame(perror)
return(list(MeanDecreaseAccuracy = colMeans(perror), VIMcTrees=perror))
}
varimp_ctrees(y)$VIMcTrees
nativeSpeaker age shoeSize
1 4.853855 30.06969 52.271824
2 15.740311 70.55825 5.409772
3 17.022082 113.86020 0.000000
4 22.003119 19.62134 50.634286
5 6.070659 28.58817 47.049866
6 16.508634 105.50321 2.302387
7 11.487349 31.80002 46.147677
8 19.250631 27.78282 43.589832
9 19.669478 98.73722 0.483079
10 11.748669 85.95768 5.812538